@ -65,8 +65,8 @@ import org.eclipse.jgit.lib.ObjectStream;
* file are discovered .
* file are discovered .
* /
* /
class SimilarityIndex {
class SimilarityIndex {
/** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS} . */
/** A special {@link TableFullException} used in place of OutOfMemoryError . */
private static final int MAX_HASH_BITS = 17 ;
private static final TableFullException TABLE_FULL_OUT_OF_MEMORY = new TableFullException ( ) ;
/ * *
/ * *
* Shift to apply before storing a key .
* Shift to apply before storing a key .
@ -76,20 +76,26 @@ class SimilarityIndex {
* /
* /
private static final int KEY_SHIFT = 32 ;
private static final int KEY_SHIFT = 32 ;
/** Maximum value of the count field, also mask to extract the count. */
private static final long MAX_COUNT = ( 1L < < KEY_SHIFT ) - 1 ;
/** Total size of the file we hashed into the structure. */
/** Total size of the file we hashed into the structure. */
private long fileSize ;
private long fileSize ;
/** Number of non-zero entries in {@link #idHash}. */
/** Number of non-zero entries in {@link #idHash}. */
private int idSize ;
private int idSize ;
/** {@link #idSize} that triggers {@link #idHash} to double in size. */
private int idGrowAt ;
/ * *
/ * *
* Pairings of content keys and counters .
* Pairings of content keys and counters .
* < p >
* < p >
* Slots in the table are actually two ints wedged into a single long . The
* Slots in the table are actually two ints wedged into a single long . The
* upper { @link # MAX_HASH_BITS } bits stores the content key , and the
* upper 32 bits store the content key , and the remaining lower bits store
* remaining lower bits stores the number of bytes associated with that key .
* the number of bytes associated with that key . Empty slots are denoted by
* Empty slots are denoted by 0 , which cannot occur because the count cannot
* 0 , which cannot occur because the count cannot be 0 . Values can only be
* be 0 . Values can only be positive , which we enforce during key addition .
* positive , which we enforce during key addition .
* /
* /
private long [ ] idHash ;
private long [ ] idHash ;
@ -99,6 +105,7 @@ class SimilarityIndex {
// Creates an empty index. The table starts with 8 hash bits, i.e. an
// idHash array of 1 << 8 = 256 slots, all zero (zero marks an empty slot).
SimilarityIndex ( ) {
SimilarityIndex ( ) {
idHashBits = 8 ;
idHashBits = 8 ;
idHash = new long [ 1 < < idHashBits ] ;
idHash = new long [ 1 < < idHashBits ] ;
// New side only: precompute the idSize threshold at which add() calls
// grow(), so the threshold matches the initial table size.
idGrowAt = growAt ( idHashBits ) ;
}
}
long getFileSize ( ) {
long getFileSize ( ) {
@ -109,7 +116,8 @@ class SimilarityIndex {
fileSize = size ;
fileSize = size ;
}
}
void hash ( ObjectLoader obj ) throws MissingObjectException , IOException {
void hash ( ObjectLoader obj ) throws MissingObjectException , IOException ,
TableFullException {
if ( obj . isLarge ( ) ) {
if ( obj . isLarge ( ) ) {
ObjectStream in = obj . openStream ( ) ;
ObjectStream in = obj . openStream ( ) ;
try {
try {
@ -125,7 +133,7 @@ class SimilarityIndex {
}
}
}
}
void hash ( byte [ ] raw , int ptr , final int end ) {
void hash ( byte [ ] raw , int ptr , final int end ) throws TableFullException {
while ( ptr < end ) {
while ( ptr < end ) {
int hash = 5381 ;
int hash = 5381 ;
int start = ptr ;
int start = ptr ;
@ -141,7 +149,8 @@ class SimilarityIndex {
}
}
}
}
void hash ( InputStream in , long remaining ) throws IOException {
void hash ( InputStream in , long remaining ) throws IOException ,
TableFullException {
byte [ ] buf = new byte [ 4096 ] ;
byte [ ] buf = new byte [ 4096 ] ;
int ptr = 0 ;
int ptr = 0 ;
int cnt = 0 ;
int cnt = 0 ;
@ -190,11 +199,11 @@ class SimilarityIndex {
return ( int ) ( ( common ( dst ) * maxScore ) / max ) ;
return ( int ) ( ( common ( dst ) * maxScore ) / max ) ;
}
}
// Instance-side entry point: delegates to the static common(src, dst)
// overload with this index as the source and dst as the destination.
// The new side widens the return type from int to long, matching the
// long accumulator used by the static overloads on the new side.
int common ( SimilarityIndex dst ) {
long common ( SimilarityIndex dst ) {
return common ( this , dst ) ;
return common ( this , dst ) ;
}
}
private static int common ( SimilarityIndex src , SimilarityIndex dst ) {
private static long common ( SimilarityIndex src , SimilarityIndex dst ) {
int srcIdx = src . packedIndex ( 0 ) ;
int srcIdx = src . packedIndex ( 0 ) ;
int dstIdx = dst . packedIndex ( 0 ) ;
int dstIdx = dst . packedIndex ( 0 ) ;
long [ ] srcHash = src . idHash ;
long [ ] srcHash = src . idHash ;
@ -202,12 +211,12 @@ class SimilarityIndex {
return common ( srcHash , srcIdx , dstHash , dstIdx ) ;
return common ( srcHash , srcIdx , dstHash , dstIdx ) ;
}
}
private static int common ( long [ ] srcHash , int srcIdx , //
private static long common ( long [ ] srcHash , int srcIdx , //
long [ ] dstHash , int dstIdx ) {
long [ ] dstHash , int dstIdx ) {
if ( srcIdx = = srcHash . length | | dstIdx = = dstHash . length )
if ( srcIdx = = srcHash . length | | dstIdx = = dstHash . length )
return 0 ;
return 0 ;
int common = 0 ;
long common = 0 ;
int srcKey = keyOf ( srcHash [ srcIdx ] ) ;
int srcKey = keyOf ( srcHash [ srcIdx ] ) ;
int dstKey = keyOf ( dstHash [ dstIdx ] ) ;
int dstKey = keyOf ( dstHash [ dstIdx ] ) ;
@ -230,8 +239,8 @@ class SimilarityIndex {
break ;
break ;
srcKey = keyOf ( srcHash [ srcIdx ] ) ;
srcKey = keyOf ( srcHash [ srcIdx ] ) ;
} else /* if (srcKey > dst Key) */ {
} else /* if (dstKey < src Key) */ {
// Regions of dst which do not appear in dst .
// Regions of dst which do not appear in src .
if ( + + dstIdx = = dstHash . length )
if ( + + dstIdx = = dstHash . length )
break ;
break ;
dstKey = keyOf ( dstHash [ dstIdx ] ) ;
dstKey = keyOf ( dstHash [ dstIdx ] ) ;
@ -268,7 +277,7 @@ class SimilarityIndex {
return ( idHash . length - idSize ) + idx ;
return ( idHash . length - idSize ) + idx ;
}
}
void add ( int key , int cnt ) {
void add ( int key , int cnt ) throws TableFullException {
key = ( key * 0x9e370001 ) > > > 1 ; // Mix bits and ensure not negative.
key = ( key * 0x9e370001 ) > > > 1 ; // Mix bits and ensure not negative.
int j = slot ( key ) ;
int j = slot ( key ) ;
@ -276,18 +285,20 @@ class SimilarityIndex {
long v = idHash [ j ] ;
long v = idHash [ j ] ;
if ( v = = 0 ) {
if ( v = = 0 ) {
// Empty slot in the table, store here.
// Empty slot in the table, store here.
if ( shouldGrow ( ) ) {
if ( idGrowAt < = idSize ) {
grow ( ) ;
grow ( ) ;
j = slot ( key ) ;
j = slot ( key ) ;
continue ;
continue ;
}
}
idHash [ j ] = ( ( ( long ) key ) < < KEY_SHIFT ) | cnt ;
idHash [ j ] = pair ( key , cnt ) ;
idSize + + ;
idSize + + ;
return ;
return ;
} else if ( keyOf ( v ) = = key ) {
} else if ( keyOf ( v ) = = key ) {
// Same key, increment the counter.
// Same key, increment the counter. If it overflows, fail
idHash [ j ] = v + cnt ;
// indexing to prevent the key from being impacted.
//
idHash [ j ] = pair ( key , countOf ( v ) + cnt ) ;
return ;
return ;
} else if ( + + j > = idHash . length ) {
} else if ( + + j > = idHash . length ) {
@ -296,6 +307,12 @@ class SimilarityIndex {
}
}
}
}
private static long pair ( int key , long cnt ) throws TableFullException {
if ( MAX_COUNT < cnt )
throw new TableFullException ( ) ;
return ( ( ( long ) key ) < < KEY_SHIFT ) | cnt ;
}
private int slot ( int key ) {
private int slot ( int key ) {
// We use 31 - idHashBits because the upper bit was already forced
// We use 31 - idHashBits because the upper bit was already forced
// to be 0 and we want the remaining high bits to be used as the
// to be 0 and we want the remaining high bits to be used as the
@ -304,16 +321,26 @@ class SimilarityIndex {
return key > > > ( 31 - idHashBits ) ;
return key > > > ( 31 - idHashBits ) ;
}
}
private boolean shouldGrow ( ) {
// Returns the number of occupied slots ( idSize ) at which a table of
// 1 < < idHashBits slots should be grown: the table size scaled by
// ( idHashBits - 3 ) / idHashBits .
//
// The product is computed as a long. In int arithmetic
// ( 1 < < idHashBits ) * ( idHashBits - 3 ) exceeds Integer.MAX_VALUE once
// idHashBits reaches 27, which makes the threshold negative and forces a
// grow on every insert into an empty slot until grow ( ) gives up at 30
// bits. The final quotient is always below 1 < < idHashBits , so
// narrowing back to int is safe for every idHashBits up to 30.
private static int growAt ( int idHashBits ) {
return idHashBits < MAX_HASH_BITS & & idHash . length < = idSize * 2 ;
return ( int ) ( ( 1L < < idHashBits ) * ( idHashBits - 3 ) / idHashBits ) ;
}
}
private void grow ( ) {
private void grow ( ) throws TableFullException {
if ( idHashBits = = 30 )
throw new TableFullException ( ) ;
long [ ] oldHash = idHash ;
long [ ] oldHash = idHash ;
int oldSize = idHash . length ;
int oldSize = idHash . length ;
idHashBits + + ;
idHashBits + + ;
idGrowAt = growAt ( idHashBits ) ;
try {
idHash = new long [ 1 < < idHashBits ] ;
idHash = new long [ 1 < < idHashBits ] ;
} catch ( OutOfMemoryError noMemory ) {
throw TABLE_FULL_OUT_OF_MEMORY ;
}
for ( int i = 0 ; i < oldSize ; i + + ) {
for ( int i = 0 ; i < oldSize ; i + + ) {
long v = oldHash [ i ] ;
long v = oldHash [ i ] ;
if ( v ! = 0 ) {
if ( v ! = 0 ) {
@ -330,7 +357,11 @@ class SimilarityIndex {
return ( int ) ( v > > > KEY_SHIFT ) ;
return ( int ) ( v > > > KEY_SHIFT ) ;
}
}
// Extracts the count stored in the low bits of a packed slot value.
// Old side: narrows the slot to int, i.e. keeps the low 32 bits as a
// signed value. New side: masks with MAX_COUNT ( ( 1L < < KEY_SHIFT ) - 1 )
// and returns a long, so the count is never negative.
private static int countOf ( long v ) {
private static long countOf ( long v ) {
return ( int ) v ;
return v & MAX_COUNT ;
}
/**
 * Checked exception signalling that the index cannot accept more data. In
 * the code shown here it is thrown when a count exceeds MAX_COUNT, when the
 * table is already at 30 hash bits and would have to grow, or (as a
 * preallocated instance) when allocating the larger table fails with
 * OutOfMemoryError.
 */
static class TableFullException extends Exception {
private static final long serialVersionUID = 1L ;
}
}
}
}