From 9a48de86d892a3a1f028f4899f34898e58d759b3 Mon Sep 17 00:00:00 2001 From: Jeff Schumacher Date: Fri, 9 Jul 2010 15:11:54 -0700 Subject: [PATCH] Added file path similarity to scoring metric in rename detection The scoring method was not taking into account the similarity of the file paths and file names. I changed the metric so that it is 99% based on content (which used to be 100% of the old metric), and 1% based on path similarity. Of that 1%, half (.5% of the total final score) is based on the actual file names (e.g. "foo.java"), and half on the directory (e.g. "src/com/foo/bar/"). Change-Id: I94f0c23bf6413c491b10d5625f6ad7d2ecfb4def --- .../eclipse/jgit/diff/RenameDetectorTest.java | 12 ++-- .../jgit/diff/SimilarityIndexTest.java | 8 +-- .../eclipse/jgit/diff/SimilarityIndex.java | 6 +- .../jgit/diff/SimilarityRenameDetector.java | 56 ++++++++++++++++++- 4 files changed, 68 insertions(+), 14 deletions(-) diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java index c4cb600db..fe0c565d6 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java @@ -124,8 +124,8 @@ public class RenameDetectorTest extends RepositoryTestCase { } public void testInexactRename_OnePair() throws Exception { - ObjectId aId = blob("foo\nbar\nbaz\n"); - ObjectId bId = blob("foo\nbar\nblah\n"); + ObjectId aId = blob("foo\nbar\nbaz\nblarg\n"); + ObjectId bId = blob("foo\nbar\nbaz\nblah\n"); DiffEntry a = DiffEntry.add(PATH_A, aId); DiffEntry b = DiffEntry.delete(PATH_Q, bId); @@ -135,12 +135,12 @@ public class RenameDetectorTest extends RepositoryTestCase { List entries = rd.compute(); assertEquals(1, entries.size()); - assertRename(b, a, 61, entries.get(0)); + assertRename(b, a, 66, entries.get(0)); } public void testInexactRename_OneRenameTwoUnrelatedFiles() throws Exception { - 
ObjectId aId = blob("foo\nbar\nbaz\n"); - ObjectId bId = blob("foo\nbar\nblah\n"); + ObjectId aId = blob("foo\nbar\nbaz\nblarg\n"); + ObjectId bId = blob("foo\nbar\nbaz\nblah\n"); DiffEntry a = DiffEntry.add(PATH_A, aId); DiffEntry b = DiffEntry.delete(PATH_Q, bId); @@ -158,7 +158,7 @@ public class RenameDetectorTest extends RepositoryTestCase { assertEquals(3, entries.size()); assertSame(c, entries.get(0)); assertSame(d, entries.get(1)); - assertRename(b, a, 61, entries.get(2)); + assertRename(b, a, 66, entries.get(2)); } public void testInexactRename_LastByteDifferent() throws Exception { diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java index 9ab745fac..d6915eb87 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java @@ -78,8 +78,8 @@ public class SimilarityIndexTest extends TestCase { assertEquals(8, src.common(dst)); assertEquals(8, dst.common(src)); - assertEquals(100, src.score(dst)); - assertEquals(100, dst.score(src)); + assertEquals(100, src.score(dst, 100)); + assertEquals(100, dst.score(src, 100)); } public void testCommonScore_EmptyFiles() { @@ -102,8 +102,8 @@ public class SimilarityIndexTest extends TestCase { assertEquals(6, src.common(dst)); assertEquals(6, dst.common(src)); - assertEquals(75, src.score(dst)); - assertEquals(75, dst.score(src)); + assertEquals(75, src.score(dst, 100)); + assertEquals(75, dst.score(src, 100)); } private static SimilarityIndex hash(String text) { diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java index f4cccfc37..d5a31d604 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java @@ -142,11 +142,11 @@ class SimilarityIndex 
{ Arrays.sort(idHash); } - int score(SimilarityIndex dst) { + int score(SimilarityIndex dst, int maxScore) { long max = Math.max(fileSize, dst.fileSize); if (max == 0) - return 100; - return (int) ((common(dst) * 100L) / max); + return maxScore; + return (int) ((common(dst) * (long) maxScore) / max); /* multiply as long, like the old 100L: the int product common * maxScore overflows for large files */ } int common(SimilarityIndex dst) { diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java index a343fc062..6590f746f 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java @@ -260,7 +260,14 @@ class SimilarityRenameDetector { } SimilarityIndex d = hash(dstEnt.newId.toObjectId()); - int score = s.score(d); + int contentScore = s.score(d, 10000); + + // nameScore returns a value between 0 and 100, but we want it + // to be in the same range as the content score. This allows it + // to be dropped into the pretty formula for the final score. 
+ int nameScore = nameScore(srcEnt.oldName, dstEnt.newName) * 100; + + int score = (contentScore * 99 + nameScore * 1) / 10000; if (score < renameScore) { pm.update(1); @@ -280,6 +287,53 @@ class SimilarityRenameDetector { return mNext; } + private int nameScore(String a, String b) { + int aDirLen = a.lastIndexOf("/") + 1; + int bDirLen = b.lastIndexOf("/") + 1; + + int dirMin = Math.min(aDirLen, bDirLen); + int dirMax = Math.max(aDirLen, bDirLen); + + final int dirScoreLtr; + final int dirScoreRtl; + + if (dirMax == 0) { + dirScoreLtr = 100; + dirScoreRtl = 100; + } else { + int dirSim = 0; + for (; dirSim < dirMin; dirSim++) { + if (a.charAt(dirSim) != b.charAt(dirSim)) + break; + } + dirScoreLtr = (dirSim * 100) / dirMax; + + if (dirScoreLtr == 100) { + dirScoreRtl = 100; + } else { + for (dirSim = 0; dirSim < dirMin; dirSim++) { + if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1 + - dirSim)) + break; + } + dirScoreRtl = (dirSim * 100) / dirMax; + } + } + + int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen); + int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen); + + int fileSim = 0; + for (; fileSim < fileMin; fileSim++) { + if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1 + - fileSim)) + break; + } + int fileScore = fileMax == 0 ? 100 : (fileSim * 100) / fileMax; /* two empty file names count as identical, mirroring the dirMax == 0 case, instead of dividing by zero */ + + return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100; + } + private SimilarityIndex hash(ObjectId objectId) throws IOException { SimilarityIndex r = new SimilarityIndex(); r.hash(repo.openObject(objectId));