From fd527a2cd7ec14b8ea592a3a41d60367924a6f9f Mon Sep 17 00:00:00 2001 From: Mike Williams Date: Fri, 17 Jun 2016 11:34:36 -0400 Subject: [PATCH] Prune UNREACHABLE_GARBAGE packs when they expire DfsGarbageCollector will now enforce a maximum time to live (TTL) for UNREACHABLE_GARBAGE packs. The default TTL is 1 day, which should be enough time to avoid races with other processes that are inserting data into the repository. Change-Id: Id719e6e2a03cfc9a0c0aef8ed71d261dda14bd0c Signed-off-by: Mike Williams --- .../storage/dfs/DfsGarbageCollectorTest.java | 239 ++++++++++++++++++ .../storage/dfs/DfsGarbageCollector.java | 117 ++++++++- 2 files changed, 345 insertions(+), 11 deletions(-) create mode 100644 org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/dfs/DfsGarbageCollectorTest.java diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/dfs/DfsGarbageCollectorTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/dfs/DfsGarbageCollectorTest.java new file mode 100644 index 000000000..c7125bb9f --- /dev/null +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/dfs/DfsGarbageCollectorTest.java @@ -0,0 +1,239 @@ +package org.eclipse.jgit.internal.storage.dfs; + +import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.GC; +import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.INSERT; +import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.UNREACHABLE_GARBAGE; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.IOException; +import java.util.concurrent.TimeUnit; + +import org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource; +import org.eclipse.jgit.junit.TestRepository; +import org.eclipse.jgit.lib.AnyObjectId; +import org.eclipse.jgit.lib.Ref; +import org.eclipse.jgit.lib.Repository; +import org.eclipse.jgit.revwalk.RevCommit; +import org.eclipse.jgit.revwalk.RevWalk; +import org.junit.Before; +import org.junit.Test; + +public class DfsGarbageCollectorTest { + private TestRepository git; + private InMemoryRepository repo; + private DfsObjDatabase odb; + + @Before + public void setUp() throws IOException { + DfsRepositoryDescription desc = new DfsRepositoryDescription("test"); + git = new TestRepository<>(new InMemoryRepository(desc)); + repo = git.getRepository(); + odb = repo.getObjectDatabase(); + } + + @Test + public void testCollectionWithNoGarbage() throws Exception { + RevCommit commit0 = commit().message("0").create(); + RevCommit commit1 = commit().message("1").parent(commit0).create(); + git.update("master", commit1); + + assertTrue("commit0 reachable", isReachable(repo, commit0)); + assertTrue("commit1 reachable", isReachable(repo, commit1)); + + // Packs start out as INSERT. + assertEquals(2, odb.getPacks().length); + for (DfsPackFile pack : odb.getPacks()) { + assertEquals(INSERT, pack.getPackDescription().getPackSource()); + } + + gcNoTtl(); + + // Single GC pack present with all objects. + assertEquals(1, odb.getPacks().length); + DfsPackFile pack = odb.getPacks()[0]; + assertEquals(GC, pack.getPackDescription().getPackSource()); + assertTrue("commit0 in pack", isObjectInPack(commit0, pack)); + assertTrue("commit1 in pack", isObjectInPack(commit1, pack)); + } + + @Test + public void testCollectionWithGarbage() throws Exception { + RevCommit commit0 = commit().message("0").create(); + RevCommit commit1 = commit().message("1").parent(commit0).create(); + git.update("master", commit0); + + assertTrue("commit0 reachable", isReachable(repo, commit0)); + assertFalse("commit1 garbage", isReachable(repo, commit1)); + gcNoTtl(); + + assertEquals(2, odb.getPacks().length); + DfsPackFile gc = null; + DfsPackFile garbage = null; + for (DfsPackFile pack : odb.getPacks()) { + DfsPackDescription d = pack.getPackDescription(); + if (d.getPackSource() == GC) { + gc = pack; + } else if (d.getPackSource() == UNREACHABLE_GARBAGE) { + garbage = pack; + } else { + fail("unexpected " + d.getPackSource()); + } + } + + assertNotNull("created GC pack", gc); + assertTrue(isObjectInPack(commit0, gc)); + + assertNotNull("created UNREACHABLE_GARBAGE pack", garbage); + assertTrue(isObjectInPack(commit1, garbage)); + } + + @Test + public void testCollectionWithGarbageAndGarbagePacksPurged() + throws Exception { + RevCommit commit0 = commit().message("0").create(); + RevCommit commit1 = commit().message("1").parent(commit0).create(); + git.update("master", commit0); + + gcNoTtl(); + gcWithTtl(); + + // The repository has an UNREACHABLE_GARBAGE pack that could have + // expired, but since we never purge the most recent UNREACHABLE_GARBAGE + // pack, it must have survived the GC. + boolean commit1Found = false; + for (DfsPackFile pack : odb.getPacks()) { + DfsPackDescription d = pack.getPackDescription(); + if (d.getPackSource() == GC) { + assertTrue("has commit0", isObjectInPack(commit0, pack)); + assertFalse("no commit1", isObjectInPack(commit1, pack)); + } else if (d.getPackSource() == UNREACHABLE_GARBAGE) { + commit1Found |= isObjectInPack(commit1, pack); + } else { + fail("unexpected " + d.getPackSource()); + } + } + assertTrue("garbage commit1 still readable", commit1Found); + + // Find oldest UNREACHABLE_GARBAGE; it will be pruned by next GC. + DfsPackDescription oldestGarbagePack = null; + for (DfsPackFile pack : odb.getPacks()) { + DfsPackDescription d = pack.getPackDescription(); + if (d.getPackSource() == UNREACHABLE_GARBAGE) { + oldestGarbagePack = oldestPack(oldestGarbagePack, d); + } + } + assertNotNull("has UNREACHABLE_GARBAGE", oldestGarbagePack); + + gcWithTtl(); + assertTrue("has packs", odb.getPacks().length > 0); + for (DfsPackFile pack : odb.getPacks()) { + assertNotEquals(oldestGarbagePack, pack.getPackDescription()); + } + } + + @Test + public void testCollectionWithGarbageCoalescence() throws Exception { + RevCommit commit0 = commit().message("0").create(); + RevCommit commit1 = commit().message("1").parent(commit0).create(); + git.update("master", commit0); + + for (int i = 0; i < 3; i++) { + commit1 = commit().message("g" + i).parent(commit1).create(); + + // Make sure we don't have more than 1 UNREACHABLE_GARBAGE pack + // because they're coalesced. + gcNoTtl(); + assertEquals(1, countPacks(UNREACHABLE_GARBAGE)); + } + } + + @Test + public void testCollectionWithGarbageNoCoalescence() throws Exception { + RevCommit commit0 = commit().message("0").create(); + RevCommit commit1 = commit().message("1").parent(commit0).create(); + git.update("master", commit0); + + for (int i = 0; i < 3; i++) { + commit1 = commit().message("g" + i).parent(commit1).create(); + + DfsGarbageCollector gc = new DfsGarbageCollector(repo); + gc.setCoalesceGarbageLimit(0); + gc.setGarbageTtl(0, TimeUnit.MILLISECONDS); + run(gc); + assertEquals(1 + i, countPacks(UNREACHABLE_GARBAGE)); + } + } + + private TestRepository.CommitBuilder commit() { + return git.commit(); + } + + private void gcNoTtl() throws IOException { + DfsGarbageCollector gc = new DfsGarbageCollector(repo); + gc.setGarbageTtl(0, TimeUnit.MILLISECONDS); // disable TTL + run(gc); + } + + private void gcWithTtl() throws InterruptedException, IOException { + // Wait for the system clock to move by at least 1 millisecond. + // This allows the DfsGarbageCollector to recognize the boundary. + long start = System.currentTimeMillis(); + do { + Thread.sleep(10); + } while (System.currentTimeMillis() <= start); + + DfsGarbageCollector gc = new DfsGarbageCollector(repo); + gc.setGarbageTtl(1, TimeUnit.MILLISECONDS); + run(gc); + } + + private void run(DfsGarbageCollector gc) throws IOException { + assertTrue("gc repacked", gc.pack(null)); + odb.clearCache(); + } + + private static boolean isReachable(Repository repo, AnyObjectId id) + throws IOException { + try (RevWalk rw = new RevWalk(repo)) { + for (Ref ref : repo.getAllRefs().values()) { + rw.markStart(rw.parseCommit(ref.getObjectId())); + } + for (RevCommit next; (next = rw.next()) != null;) { + if (AnyObjectId.equals(next, id)) { + return true; + } + } + } + return false; + } + + private boolean isObjectInPack(AnyObjectId id, DfsPackFile pack) + throws IOException { + try (DfsReader reader = new DfsReader(odb)) { + return pack.hasObject(reader, id); + } + } + + private static DfsPackDescription oldestPack(DfsPackDescription a, + DfsPackDescription b) { + if (a != null && a.getLastModified() < b.getLastModified()) { + return a; + } + return b; + } + + private int countPacks(PackSource source) throws IOException { + int cnt = 0; + for (DfsPackFile pack : odb.getPacks()) { + if (pack.getPackDescription().getPackSource() == source) { + cnt++; + } + } + return cnt; + } +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/dfs/DfsGarbageCollector.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/dfs/DfsGarbageCollector.java index 2883f4ecc..55b2d2f12 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/dfs/DfsGarbageCollector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/dfs/DfsGarbageCollector.java @@ -54,9 +54,11 @@ import static org.eclipse.jgit.internal.storage.pack.PackExt.PACK; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.eclipse.jgit.internal.JGitText; import org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource; @@ -93,10 +95,14 @@ public class DfsGarbageCollector { private PackConfig packConfig; + // See pack(), below, for how these two variables interact. private long coalesceGarbageLimit = 50 << 20; + private long garbageTtlMillis = TimeUnit.DAYS.toMillis(1); private long startTimeMillis; private List packsBefore; + private List expiredGarbagePacks; + private Set allHeads; private Set nonHeads; private Set txnHeads; @@ -167,6 +173,34 @@ public class DfsGarbageCollector { return this; } + /** + * @return garbage packs older than this limit (in milliseconds) will be + * pruned as part of the garbage collection process if the value is + * > 0, otherwise garbage packs are retained. + */ + public long getGarbageTtlMillis() { + return garbageTtlMillis; + } + + /** + * Set the time to live for garbage objects. + *

+ * Any UNREACHABLE_GARBAGE older than this limit will be pruned at the end + * of the run. + *

+ * If timeToLiveMillis is set to 0, UNREACHABLE_GARBAGE purging is disabled. + * + * @param ttl + * Time to live whatever unit is specified. + * @param unit + * The specified time unit. + * @return {@code this} + */ + public DfsGarbageCollector setGarbageTtl(long ttl, TimeUnit unit) { + garbageTtlMillis = unit.toMillis(ttl); + return this; + } + /** * Create a single new pack file containing all of the live objects. *

@@ -189,6 +223,12 @@ public class DfsGarbageCollector { if (packConfig.getIndexVersion() != 2) throw new IllegalStateException( JGitText.get().supportOnlyPackIndexVersion2); + if (garbageTtlMillis > 0) { + // We disable coalescing because the coalescing step will keep + // refreshing the UNREACHABLE_GARBAGE pack and we wouldn't + // actually prune anything. + coalesceGarbageLimit = 0; + } startTimeMillis = System.currentTimeMillis(); ctx = (DfsReader) objdb.newReader(); @@ -197,9 +237,14 @@ public class DfsGarbageCollector { objdb.clearCache(); Collection refsBefore = getAllRefs(); - packsBefore = packsToRebuild(); - if (packsBefore.isEmpty()) + readPacksBefore(); + + if (packsBefore.isEmpty()) { + if (!expiredGarbagePacks.isEmpty()) { + objdb.commitPack(noPacks(), toPrune()); + } return true; + } allHeads = new HashSet(); nonHeads = new HashSet(); @@ -254,17 +299,60 @@ public class DfsGarbageCollector { return refs; } - private List packsToRebuild() throws IOException { + private void readPacksBefore() throws IOException { DfsPackFile[] packs = objdb.getPacks(); - List out = new ArrayList(packs.length); + packsBefore = new ArrayList(packs.length); + expiredGarbagePacks = new ArrayList(packs.length); + + long mostRecentGC = mostRecentGC(packs); + long now = System.currentTimeMillis(); + for (DfsPackFile p : packs) { + DfsPackDescription d = p.getPackDescription(); + if (d.getPackSource() != UNREACHABLE_GARBAGE) { + packsBefore.add(p); + } else if (packIsExpiredGarbage(d, mostRecentGC, now)) { + expiredGarbagePacks.add(p); + } else if (d.getFileSize(PackExt.PACK) < coalesceGarbageLimit) { + packsBefore.add(p); + } + } + } + + private static long mostRecentGC(DfsPackFile[] packs) { + long r = 0; for (DfsPackFile p : packs) { DfsPackDescription d = p.getPackDescription(); - if (d.getPackSource() != UNREACHABLE_GARBAGE) - out.add(p); - else if (d.getFileSize(PackExt.PACK) < coalesceGarbageLimit) - out.add(p); + if (d.getPackSource() == GC || d.getPackSource() == GC_REST) { + r = Math.max(r, d.getLastModified()); + } } - return out; + return r; + } + + private boolean packIsExpiredGarbage(DfsPackDescription d, + long mostRecentGC, long now) { + // It should be safe to remove an UNREACHABLE_GARBAGE pack if it: + // + // (a) Predates the most recent prior run of this class. This check + // ensures the graph traversal algorithm had a chance to consider + // all objects in this pack and copied them into a GC or GC_REST + // pack if the graph contained live edges to the objects. + // + // This check is safe because of the ordering of packing; the GC + // packs are written first and then the UNREACHABLE_GARBAGE is + // constructed. Any UNREACHABLE_GARBAGE dated earlier than the GC + // was input to the prior GC's graph traversal. + // + // (b) Is older than garbagePackTtl. This check gives concurrent + // inserter threads sufficient time to identify an object is not + // in the graph and should have a new copy written, rather than + // relying on something from an UNREACHABLE_GARBAGE pack. + // + // Both (a) and (b) must be met to safely remove UNREACHABLE_GARBAGE. + return d.getPackSource() == UNREACHABLE_GARBAGE + && d.getLastModified() < mostRecentGC + && garbageTtlMillis > 0 + && now - d.getLastModified() >= garbageTtlMillis; } /** @return all of the source packs that fed into this compaction. */ @@ -285,8 +373,12 @@ public class DfsGarbageCollector { private List toPrune() { int cnt = packsBefore.size(); List all = new ArrayList(cnt); - for (DfsPackFile pack : packsBefore) + for (DfsPackFile pack : packsBefore) { + all.add(pack.getPackDescription()); + } + for (DfsPackFile pack : expiredGarbagePacks) { all.add(pack.getPackDescription()); + } return all; } @@ -329,7 +421,6 @@ public class DfsGarbageCollector { } private void packGarbage(ProgressMonitor pm) throws IOException { - // TODO(sop) This is ugly. The garbage pack needs to be deleted. PackConfig cfg = new PackConfig(packConfig); cfg.setReuseDeltas(true); cfg.setReuseObjects(true); @@ -420,4 +511,8 @@ public class DfsGarbageCollector { DfsBlockCache.getInstance().getOrCreate(pack, null); return pack; } + + private static List noPacks() { + return Collections.emptyList(); + } }