From 31311cacfd311f9a78bdf012a9f6ba1907b9964a Mon Sep 17 00:00:00 2001 From: Jeff Schumacher Date: Tue, 13 Jul 2010 12:21:12 -0700 Subject: [PATCH] Implemented file path based tie breaking to exact rename detection During the exact rename detection phase in RenameDetector, ties were resolved on a first-found basis. I added support for file path based tie breaking during that phase. Basically, there are four situations that have to be handled: One add matching one delete: In this simple case, we pair them as a rename. One add matching many deletes: Find the delete whos path matches the add the closest, and pair them as a rename. Many adds matching one delete: Similar to the above case, we find the add that matches the delete the closest, and pair them as a rename. The other adds are marked as copies of the delete. Many adds matching many deletes: Build a scoring matrix similar to the one used for content- based matching, scoring instead by file path. Some of the utility functions in SimilarityRenameDetector are used in this case, as we use the same encoding scheme. Once the matrix is built, scan it for the best matches, marking them as renames. The rest are marked as copies. I don't particularly like the idea of using utility functions right out of SimilarityRenameDetector, but it works for the moment. A later commit will likely refactor this into a common utility class, as well as bringing exact rename detection out of RenameDetector and into a separate class, much like SimilarityRenameDetector. Change-Id: I1fb08390aebdcbf20d049aecf402a36506e55611 --- .../eclipse/jgit/diff/RenameDetectorTest.java | 58 +++++ .../src/org/eclipse/jgit/diff/DiffEntry.java | 27 +++ .../org/eclipse/jgit/diff/RenameDetector.java | 211 ++++++++++++++---- .../jgit/diff/SimilarityRenameDetector.java | 8 +- 4 files changed, 257 insertions(+), 47 deletions(-) diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java index 86b40e2f4..c32c78668 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java @@ -146,6 +146,48 @@ public class RenameDetectorTest extends RepositoryTestCase { assertRename(a, d, 100, entries.get(2)); } + public void testExactRename_PathBreaksTie() throws Exception { + ObjectId foo = blob("foo"); + + DiffEntry a = DiffEntry.add("src/com/foo/a.java", foo); + DiffEntry b = DiffEntry.delete("src/com/foo/b.java", foo); + + DiffEntry c = DiffEntry.add("c.txt", foo); + DiffEntry d = DiffEntry.delete("d.txt", foo); + + // Add out of order to avoid first-match succeeding + rd.add(a); + rd.add(d); + rd.add(b); + rd.add(c); + + List entries = rd.compute(); + assertEquals(2, entries.size()); + assertRename(d, c, 100, entries.get(0)); + assertRename(b, a, 100, entries.get(1)); + } + + public void testExactRename_OneDeleteManyAdds() throws Exception { + ObjectId foo = blob("foo"); + + DiffEntry a = DiffEntry.add("src/com/foo/a.java", foo); + DiffEntry b = DiffEntry.add("src/com/foo/b.java", foo); + DiffEntry c = DiffEntry.add("c.txt", foo); + + DiffEntry d = DiffEntry.delete("d.txt", foo); + + rd.add(a); + rd.add(b); + rd.add(c); + rd.add(d); + + List entries = rd.compute(); + assertEquals(3, entries.size()); + assertRename(d, c, 100, entries.get(0)); + assertCopy(d, a, 100, entries.get(1)); + assertCopy(d, b, 100, entries.get(2)); + } + public void testInexactRename_OnePair() throws Exception { ObjectId aId = blob("foo\nbar\nbaz\nblarg\n"); ObjectId bId = blob("foo\nbar\nbaz\nblah\n"); @@ -385,4 +427,20 @@ public class RenameDetectorTest extends RepositoryTestCase { assertEquals(score, rename.getScore()); } + + private static void assertCopy(DiffEntry o, DiffEntry n, int score, + DiffEntry copy) { + assertEquals(ChangeType.COPY, copy.getChangeType()); + + assertEquals(o.getOldName(), copy.getOldName()); + assertEquals(n.getNewName(), copy.getNewName()); + + assertEquals(o.getOldMode(), copy.getOldMode()); + assertEquals(n.getNewMode(), copy.getNewMode()); + + assertEquals(o.getOldId(), copy.getOldId()); + assertEquals(n.getNewId(), copy.getNewId()); + + assertEquals(score, copy.getScore()); + } } diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java index 4d25dbd02..304e4cff3 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java @@ -315,4 +315,31 @@ public class DiffEntry { public AbbreviatedObjectId getNewId() { return newId; } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(); + buf.append("DiffEntry["); + buf.append(changeType); + buf.append(" "); + switch (changeType) { + case ADD: + buf.append(newName); + break; + case COPY: + buf.append(oldName + "->" + newName); + break; + case DELETE: + buf.append(oldName); + break; + case MODIFY: + buf.append(oldName); + break; + case RENAME: + buf.append(oldName + "->" + newName); + break; + } + buf.append("]"); + return buf.toString(); + } } \ No newline at end of file diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java index 5bb63c4dd..bc2394053 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java @@ -45,6 +45,7 @@ package org.eclipse.jgit.diff; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; @@ -312,70 +313,131 @@ public class RenameDetector { return; pm.beginTask(JGitText.get().renamesFindingExact, // - added.size() + deleted.size()); + added.size() + added.size() + deleted.size() + + added.size() * deleted.size()); - HashMap map = new HashMap(); - for (DiffEntry del : deleted) { - Object old = map.put(del.oldId, del); - if (old instanceof DiffEntry) { - ArrayList list = new ArrayList(2); - list.add((DiffEntry) old); - list.add(del); - map.put(del.oldId, list); + HashMap deletedMap = populateMap(deleted, pm); + HashMap addedMap = populateMap(added, pm); - } else if (old != null) { - // Must be a list of DiffEntries - ((List) old).add(del); - map.put(del.oldId, old); - } - pm.update(1); + ArrayList uniqueAdds = new ArrayList(added.size()); + ArrayList> nonUniqueAdds = new ArrayList>(); + + for (Object o : addedMap.values()) { + if (o instanceof DiffEntry) + uniqueAdds.add((DiffEntry) o); + else + nonUniqueAdds.add((List) o); } ArrayList left = new ArrayList(added.size()); - for (DiffEntry dst : added) { - Object del = map.get(dst.newId); + + for (DiffEntry a : uniqueAdds) { + Object del = deletedMap.get(a.newId); if (del instanceof DiffEntry) { + // We have one add to one delete: pair them if they are the same + // type DiffEntry e = (DiffEntry) del; - if (sameType(e.oldMode, dst.newMode)) { - if (e.changeType == ChangeType.DELETE) { - e.changeType = ChangeType.RENAME; - entries.add(exactRename(e, dst)); - } else { - entries.add(exactCopy(e, dst)); - } + if (sameType(e.oldMode, a.newMode)) { + e.changeType = ChangeType.RENAME; + entries.add(exactRename(e, a)); } else { - left.add(dst); + left.add(a); } - } else if (del != null) { + // We have one add to many deletes: find the delete with the + // same type and closest name to the add, then pair them List list = (List) del; - DiffEntry best = null; - for (DiffEntry e : list) { - if (sameType(e.oldMode, dst.newMode)) { - best = e; - break; - } + DiffEntry best = bestPathMatch(a, list); + if (best != null) { + best.changeType = ChangeType.RENAME; + entries.add(exactRename(best, a)); + } else { + left.add(a); } + } else { + left.add(a); + } + pm.update(1); + } + + for (List adds : nonUniqueAdds) { + Object o = deletedMap.get(adds.get(0).newId); + if (o instanceof DiffEntry) { + // We have many adds to one delete: find the add with the same + // type and closest name to the delete, then pair them. Mark the + // rest as copies of the delete. + DiffEntry d = (DiffEntry) o; + DiffEntry best = bestPathMatch(d, adds); if (best != null) { - if (best.changeType == ChangeType.DELETE) { - best.changeType = ChangeType.RENAME; - entries.add(exactRename(best, dst)); - } else { - entries.add(exactCopy(best, dst)); + d.changeType = ChangeType.RENAME; + entries.add(exactRename(d, best)); + for (DiffEntry a : adds) { + if (a != best) { + if (sameType(d.oldMode, a.newMode)) { + entries.add(exactCopy(d, a)); + } else { + left.add(a); + } + } } } else { - left.add(dst); + left.addAll(adds); } - } else { - left.add(dst); + // We have many adds to many deletes: score all the adds against + // all the deletes by path name, take the best matches, pair + // them as renames, then call the rest copies + List dels = (List) o; + long[] matrix = new long[dels.size() * adds.size()]; + int mNext = 0; + for (int addIdx = 0; addIdx < adds.size(); addIdx++) { + String addedName = adds.get(addIdx).newName; + + for (int delIdx = 0; delIdx < dels.size(); delIdx++) { + String deletedName = dels.get(delIdx).oldName; + + int score = SimilarityRenameDetector.nameScore(addedName, deletedName); + matrix[mNext] = SimilarityRenameDetector.encode(score, addIdx, delIdx); + mNext++; + } + } + + Arrays.sort(matrix); + + for (--mNext; mNext >= 0; mNext--) { + long ent = matrix[mNext]; + int delIdx = SimilarityRenameDetector.srcFile(ent); + int addIdx = SimilarityRenameDetector.dstFile(ent); + DiffEntry d = dels.get(delIdx); + DiffEntry a = adds.get(addIdx); + + if (a == null) { + pm.update(1); + continue; // was already matched earlier + } + + ChangeType type; + if (d.changeType == ChangeType.DELETE) { + // First use of this source file. Tag it as a rename so we + // later know it is already been used as a rename, other + // matches (if any) will claim themselves as copies instead. + // + d.changeType = ChangeType.RENAME; + type = ChangeType.RENAME; + } else { + type = ChangeType.COPY; + } + + entries.add(DiffEntry.pair(type, d, a, 100)); + adds.set(addIdx, null); // Claim the destination was matched. + pm.update(1); + } } - pm.update(1); } added = left; - deleted = new ArrayList(map.size()); - for (Object o : map.values()) { + deleted = new ArrayList(deletedMap.size()); + for (Object o : deletedMap.values()) { if (o instanceof DiffEntry) { DiffEntry e = (DiffEntry) o; if (e.changeType == ChangeType.DELETE) @@ -391,6 +453,69 @@ public class RenameDetector { pm.endTask(); } + /** + * Find the best match by file path for a given DiffEntry from a list of + * DiffEntrys. The returned DiffEntry will be of the same type as . If + * no DiffEntry can be found that has the same type, this method will return + * null. + * + * @param src + * the DiffEntry to try to find a match for + * @param list + * a list of DiffEntrys to search through + * @return the DiffEntry from who's file path best matches + */ + private static DiffEntry bestPathMatch(DiffEntry src, List list) { + DiffEntry best = null; + int score = -1; + + for (DiffEntry d : list) { + if (sameType(mode(d), mode(src))) { + int tmp = SimilarityRenameDetector + .nameScore(path(d), path(src)); + if (tmp > score) { + best = d; + score = tmp; + } + } + } + + return best; + } + + @SuppressWarnings("unchecked") + private HashMap populateMap( + List diffEntries, ProgressMonitor pm) { + HashMap map = new HashMap(); + for (DiffEntry de : diffEntries) { + Object old = map.put(id(de), de); + if (old instanceof DiffEntry) { + ArrayList list = new ArrayList(2); + list.add((DiffEntry) old); + list.add(de); + map.put(id(de), list); + } else if (old != null) { + // Must be a list of DiffEntries + ((List) old).add(de); + map.put(id(de), old); + } + pm.update(1); + } + return map; + } + + private static String path(DiffEntry de) { + return de.changeType == ChangeType.DELETE ? de.oldName : de.newName; + } + + private static FileMode mode(DiffEntry de) { + return de.changeType == ChangeType.DELETE ? de.oldMode : de.newMode; + } + + private static AbbreviatedObjectId id(DiffEntry de) { + return de.changeType == ChangeType.DELETE ? de.oldId : de.newId; + } + static boolean sameType(FileMode a, FileMode b) { // Files have to be of the same type in order to rename them. // We would never want to rename a file to a gitlink, or a diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java index 6590f746f..e2115f0ac 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java @@ -287,7 +287,7 @@ class SimilarityRenameDetector { return mNext; } - private int nameScore(String a, String b) { + static int nameScore(String a, String b) { int aDirLen = a.lastIndexOf("/") + 1; int bDirLen = b.lastIndexOf("/") + 1; @@ -349,15 +349,15 @@ class SimilarityRenameDetector { return (int) (value >>> SCORE_SHIFT); } - private static int srcFile(long value) { + static int srcFile(long value) { return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK); } - private static int dstFile(long value) { + static int dstFile(long value) { return decodeFile(((int) value) & INDEX_MASK); } - private static long encode(int score, int srcIdx, int dstIdx) { + static long encode(int score, int srcIdx, int dstIdx) { return (((long) score) << SCORE_SHIFT) // | (encodeFile(srcIdx) << BITS_PER_INDEX) // | encodeFile(dstIdx);