Browse Source

Break dissimilar file pairs during diff

File pairs that are very dissimilar during a diff were not being
broken apart into their constituent ADD/DELETE pairs. The leads to
sub-optimal rename detection. Take, for example, this situation:

A file exists at src/a.txt containing "foo". A user renames src/a.txt
to src/b.txt, then adds a new src/a.txt containing "bar".

Even though the old a.txt and the new b.txt are identical, the
rename detection algorithm would not detect it as a rename since
it was already paired in a MODIFY. I added code to split all
MODIFYs below a certain score into their constituent ADD/DELETE
pairs. This allows situations like the one I described above to be
more correctly handled.

Change-Id: I22c04b70581f206bbc68c4cd1ee87a1f663b418e
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
stable-0.9
Jeff Schumacher 15 years ago committed by Shawn O. Pearce
parent
commit
396fe6da45
  1. 121
      org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java
  2. 2
      org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties
  3. 2
      org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java
  4. 12
      org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java
  5. 8
      org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java
  6. 152
      org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java

121
org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java

@ -383,6 +383,116 @@ public class RenameDetectorTest extends RepositoryTestCase {
assertSame(b, entries.get(1)); assertSame(b, entries.get(1));
} }
public void testBreakModify_BreakAll() throws Exception {
ObjectId aId = blob("foo");
ObjectId bId = blob("bar");
DiffEntry m = DiffEntry.modify(PATH_A);
m.oldId = AbbreviatedObjectId.fromObjectId(aId);
m.newId = AbbreviatedObjectId.fromObjectId(bId);
DiffEntry a = DiffEntry.add(PATH_B, aId);
rd.add(a);
rd.add(m);
rd.setBreakScore(101);
List<DiffEntry> entries = rd.compute();
assertEquals(2, entries.size());
assertAdd(PATH_A, bId, FileMode.REGULAR_FILE, entries.get(0));
assertRename(DiffEntry.breakModify(m).get(0), a, 100, entries.get(1));
}
public void testBreakModify_BreakNone() throws Exception {
ObjectId aId = blob("foo");
ObjectId bId = blob("bar");
DiffEntry m = DiffEntry.modify(PATH_A);
m.oldId = AbbreviatedObjectId.fromObjectId(aId);
m.newId = AbbreviatedObjectId.fromObjectId(bId);
DiffEntry a = DiffEntry.add(PATH_B, aId);
rd.add(a);
rd.add(m);
rd.setBreakScore(-1);
List<DiffEntry> entries = rd.compute();
assertEquals(2, entries.size());
assertSame(m, entries.get(0));
assertSame(a, entries.get(1));
}
public void testBreakModify_BreakBelowScore() throws Exception {
ObjectId aId = blob("foo");
ObjectId bId = blob("bar");
DiffEntry m = DiffEntry.modify(PATH_A);
m.oldId = AbbreviatedObjectId.fromObjectId(aId);
m.newId = AbbreviatedObjectId.fromObjectId(bId);
DiffEntry a = DiffEntry.add(PATH_B, aId);
rd.add(a);
rd.add(m);
rd.setBreakScore(20); // Should break the modify
List<DiffEntry> entries = rd.compute();
assertEquals(2, entries.size());
assertAdd(PATH_A, bId, FileMode.REGULAR_FILE, entries.get(0));
assertRename(DiffEntry.breakModify(m).get(0), a, 100, entries.get(1));
}
public void testBreakModify_DontBreakAboveScore() throws Exception {
ObjectId aId = blob("blah\nblah\nfoo");
ObjectId bId = blob("blah\nblah\nbar");
DiffEntry m = DiffEntry.modify(PATH_A);
m.oldId = AbbreviatedObjectId.fromObjectId(aId);
m.newId = AbbreviatedObjectId.fromObjectId(bId);
DiffEntry a = DiffEntry.add(PATH_B, aId);
rd.add(a);
rd.add(m);
rd.setBreakScore(20); // Should not break the modify
List<DiffEntry> entries = rd.compute();
assertEquals(2, entries.size());
assertSame(m, entries.get(0));
assertSame(a, entries.get(1));
}
public void testBreakModify_RejoinIfUnpaired() throws Exception {
ObjectId aId = blob("foo");
ObjectId bId = blob("bar");
DiffEntry m = DiffEntry.modify(PATH_A);
m.oldId = AbbreviatedObjectId.fromObjectId(aId);
m.newId = AbbreviatedObjectId.fromObjectId(bId);
rd.add(m);
rd.setBreakScore(101); // Ensure m is broken apart
List<DiffEntry> entries = rd.compute();
assertEquals(1, entries.size());
DiffEntry modify = entries.get(0);
assertEquals(m.oldName, modify.oldName);
assertEquals(m.oldId, modify.oldId);
assertEquals(m.oldMode, modify.oldMode);
assertEquals(m.newName, modify.newName);
assertEquals(m.newId, modify.newId);
assertEquals(m.newMode, modify.newMode);
assertEquals(m.changeType, modify.changeType);
assertEquals(0, modify.score);
}
public void testSetRenameScore_IllegalArgs() throws Exception { public void testSetRenameScore_IllegalArgs() throws Exception {
try { try {
rd.setRenameScore(-1); rd.setRenameScore(-1);
@ -462,4 +572,15 @@ public class RenameDetectorTest extends RepositoryTestCase {
assertEquals(score, copy.getScore()); assertEquals(score, copy.getScore());
} }
private static void assertAdd(String newName, ObjectId newId,
FileMode newMode, DiffEntry add) {
assertEquals(DiffEntry.DEV_NULL, add.oldName);
assertEquals(DiffEntry.A_ZERO, add.oldId);
assertEquals(FileMode.MISSING, add.oldMode);
assertEquals(ChangeType.ADD, add.changeType);
assertEquals(newName, add.newName);
assertEquals(AbbreviatedObjectId.fromObjectId(newId), add.newId);
assertEquals(newMode, add.newMode);
}
} }

2
org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties

@ -302,8 +302,10 @@ remoteDoesNotSupportSmartHTTPPush=remote does not support smart HTTP push
remoteHungUpUnexpectedly=remote hung up unexpectedly remoteHungUpUnexpectedly=remote hung up unexpectedly
remoteNameCantBeNull=Remote name can't be null. remoteNameCantBeNull=Remote name can't be null.
renamesAlreadyFound=Renames have already been found. renamesAlreadyFound=Renames have already been found.
renamesBreakingModifies=Breaking apart modified file pairs
renamesFindingByContent=Finding renames by content similarity renamesFindingByContent=Finding renames by content similarity
renamesFindingExact=Finding exact renames renamesFindingExact=Finding exact renames
renamesRejoiningModifies=Rejoining modified file pairs
repositoryAlreadyExists=Repository already exists: {0} repositoryAlreadyExists=Repository already exists: {0}
repositoryConfigFileInvalid=Repository config file {0} invalid {1} repositoryConfigFileInvalid=Repository config file {0} invalid {1}
repositoryIsRequired=Repository is required. repositoryIsRequired=Repository is required.

2
org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java

@ -361,8 +361,10 @@ public class JGitText extends TranslationBundle {
/***/ public String remoteHungUpUnexpectedly; /***/ public String remoteHungUpUnexpectedly;
/***/ public String remoteNameCantBeNull; /***/ public String remoteNameCantBeNull;
/***/ public String renamesAlreadyFound; /***/ public String renamesAlreadyFound;
/***/ public String renamesBreakingModifies;
/***/ public String renamesFindingByContent; /***/ public String renamesFindingByContent;
/***/ public String renamesFindingExact; /***/ public String renamesFindingExact;
/***/ public String renamesRejoiningModifies;
/***/ public String repositoryAlreadyExists; /***/ public String repositoryAlreadyExists;
/***/ public String repositoryConfigFileInvalid; /***/ public String repositoryConfigFileInvalid;
/***/ public String repositoryIsRequired; /***/ public String repositoryIsRequired;

12
org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java

@ -57,7 +57,8 @@ import org.eclipse.jgit.treewalk.TreeWalk;
/** A value class representing a change to a file */ /** A value class representing a change to a file */
public class DiffEntry { public class DiffEntry {
private static final AbbreviatedObjectId A_ZERO = AbbreviatedObjectId /** Magical SHA1 used for file adds or deletes */
static final AbbreviatedObjectId A_ZERO = AbbreviatedObjectId
.fromObjectId(ObjectId.zeroId()); .fromObjectId(ObjectId.zeroId());
/** Magical file name used for file adds or deletes. */ /** Magical file name used for file adds or deletes. */
@ -171,6 +172,15 @@ public class DiffEntry {
return e; return e;
} }
/**
* Breaks apart a DiffEntry into two entries, one DELETE and one ADD.
*
* @param entry
* the DiffEntry to break apart.
* @return a list containing two entries. Calling {@link #getChangeType()}
* on the first entry will return ChangeType.DELETE. Calling it on
* the second entry will return ChangeType.ADD.
*/
static List<DiffEntry> breakModify(DiffEntry entry) { static List<DiffEntry> breakModify(DiffEntry entry) {
DiffEntry del = new DiffEntry(); DiffEntry del = new DiffEntry();
del.oldId = entry.getOldId(); del.oldId = entry.getOldId();

8
org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java

@ -290,6 +290,14 @@ public class DiffFormatter {
o.write('\n'); o.write('\n');
} }
break; break;
case MODIFY:
int score = ent.getScore();
if (0 < score && score <= 100) {
o.write(encodeASCII("dissimilarity index " + (100 - score)
+ "%"));
o.write('\n');
}
break;
} }
switch (ent.getChangeType()) { switch (ent.getChangeType()) {

152
org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java

@ -55,6 +55,7 @@ import java.util.List;
import org.eclipse.jgit.JGitText; import org.eclipse.jgit.JGitText;
import org.eclipse.jgit.diff.DiffEntry.ChangeType; import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.lib.AbbreviatedObjectId; import org.eclipse.jgit.lib.AbbreviatedObjectId;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.FileMode; import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.NullProgressMonitor; import org.eclipse.jgit.lib.NullProgressMonitor;
import org.eclipse.jgit.lib.ObjectReader; import org.eclipse.jgit.lib.ObjectReader;
@ -99,7 +100,7 @@ public class RenameDetector {
} }
}; };
private final List<DiffEntry> entries = new ArrayList<DiffEntry>(); private List<DiffEntry> entries = new ArrayList<DiffEntry>();
private List<DiffEntry> deleted = new ArrayList<DiffEntry>(); private List<DiffEntry> deleted = new ArrayList<DiffEntry>();
@ -112,6 +113,13 @@ public class RenameDetector {
/** Similarity score required to pair an add/delete as a rename. */ /** Similarity score required to pair an add/delete as a rename. */
private int renameScore = 60; private int renameScore = 60;
/**
* Similarity score required to keep modified file pairs together. Any
* modified file pairs with a similarity score below this will be broken
* apart.
*/
private int breakScore = -1;
/** Limit in the number of files to consider for renames. */ /** Limit in the number of files to consider for renames. */
private int renameLimit; private int renameLimit;
@ -159,6 +167,29 @@ public class RenameDetector {
renameScore = score; renameScore = score;
} }
/**
* @return the similarity score required to keep modified file pairs
* together. Any modify pairs that score below this will be broken
* apart into separate add/deletes. Values less than or equal to
* zero indicate that no modifies will be broken apart. Values over
* 100 cause all modify pairs to be broken.
*/
public int getBreakScore() {
return breakScore;
}
/**
* @param breakScore
* the similarity score required to keep modified file pairs
* together. Any modify pairs that score below this will be
* broken apart into separate add/deletes. Values less than or
* equal to zero indicate that no modifies will be broken apart.
* Values over 100 cause all modify pairs to be broken.
*/
public void setBreakScore(int breakScore) {
this.breakScore = breakScore;
}
/** @return limit on number of paths to perform inexact rename detection. */ /** @return limit on number of paths to perform inexact rename detection. */
public int getRenameLimit() { public int getRenameLimit() {
return renameLimit; return renameLimit;
@ -219,10 +250,13 @@ public class RenameDetector {
break; break;
case MODIFY: case MODIFY:
if (sameType(entry.getOldMode(), entry.getNewMode())) if (sameType(entry.getOldMode(), entry.getNewMode())) {
entries.add(entry); entries.add(entry);
else } else {
entries.addAll(DiffEntry.breakModify(entry)); List<DiffEntry> tmp = DiffEntry.breakModify(entry);
deleted.add(tmp.get(0));
added.add(tmp.get(1));
}
break; break;
case COPY: case COPY:
@ -275,8 +309,15 @@ public class RenameDetector {
if (pm == null) if (pm == null)
pm = NullProgressMonitor.INSTANCE; pm = NullProgressMonitor.INSTANCE;
findExactRenames(pm); ObjectReader reader = repo.newObjectReader();
findContentRenames(pm); try {
breakModifies(reader, pm);
findExactRenames(pm);
findContentRenames(reader, pm);
rejoinModifies(pm);
} finally {
reader.release();
}
entries.addAll(added); entries.addAll(added);
added = null; added = null;
@ -289,25 +330,96 @@ public class RenameDetector {
return Collections.unmodifiableList(entries); return Collections.unmodifiableList(entries);
} }
private void findContentRenames(ProgressMonitor pm) throws IOException { private void breakModifies(ObjectReader reader, ProgressMonitor pm)
throws IOException {
if (breakScore <= 0)
return;
ArrayList<DiffEntry> newEntries = new ArrayList<DiffEntry>(entries.size());
pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size());
for (int i = 0; i < entries.size(); i++) {
DiffEntry e = entries.get(i);
if (e.getChangeType() == ChangeType.MODIFY) {
int score = calculateModifyScore(reader, e);
if (score < breakScore) {
List<DiffEntry> tmp = DiffEntry.breakModify(e);
DiffEntry del = tmp.get(0);
del.score = score;
deleted.add(del);
added.add(tmp.get(1));
} else {
newEntries.add(e);
}
} else {
newEntries.add(e);
}
pm.update(1);
}
entries = newEntries;
}
private void rejoinModifies(ProgressMonitor pm) {
HashMap<String, DiffEntry> nameMap = new HashMap<String, DiffEntry>();
ArrayList<DiffEntry> newAdded = new ArrayList<DiffEntry>(added.size());
pm.beginTask(JGitText.get().renamesRejoiningModifies, added.size()
+ deleted.size());
for (DiffEntry src : deleted) {
nameMap.put(src.oldName, src);
pm.update(1);
}
for (DiffEntry dst : added) {
DiffEntry src = nameMap.remove(dst.newName);
if (src != null) {
if (sameType(src.oldMode, dst.newMode)) {
entries.add(DiffEntry.pair(ChangeType.MODIFY, src, dst,
src.score));
} else {
nameMap.put(src.oldName, src);
newAdded.add(dst);
}
} else {
newAdded.add(dst);
}
pm.update(1);
}
added = newAdded;
deleted = new ArrayList<DiffEntry>(nameMap.values());
}
private int calculateModifyScore(ObjectReader reader, DiffEntry d)
throws IOException {
SimilarityIndex src = new SimilarityIndex();
src.hash(reader.open(d.oldId.toObjectId(), Constants.OBJ_BLOB));
src.sort();
SimilarityIndex dst = new SimilarityIndex();
dst.hash(reader.open(d.newId.toObjectId(), Constants.OBJ_BLOB));
dst.sort();
return src.score(dst, 100);
}
private void findContentRenames(ObjectReader reader, ProgressMonitor pm)
throws IOException {
int cnt = Math.max(added.size(), deleted.size()); int cnt = Math.max(added.size(), deleted.size());
if (cnt == 0) if (cnt == 0)
return; return;
if (getRenameLimit() == 0 || cnt <= getRenameLimit()) { if (getRenameLimit() == 0 || cnt <= getRenameLimit()) {
ObjectReader reader = repo.newObjectReader(); SimilarityRenameDetector d;
try {
SimilarityRenameDetector d; d = new SimilarityRenameDetector(reader, deleted, added);
d.setRenameScore(getRenameScore());
d = new SimilarityRenameDetector(reader, deleted, added); d.compute(pm);
d.setRenameScore(getRenameScore()); deleted = d.getLeftOverSources();
d.compute(pm); added = d.getLeftOverDestinations();
deleted = d.getLeftOverSources(); entries.addAll(d.getMatches());
added = d.getLeftOverDestinations();
entries.addAll(d.getMatches());
} finally {
reader.release();
}
} else { } else {
overRenameLimit = true; overRenameLimit = true;
} }

Loading…
Cancel
Save