@ -44,6 +44,8 @@
package org.eclipse.jgit.util ;
package org.eclipse.jgit.util ;
import static java.nio.charset.StandardCharsets.ISO_8859_1 ;
import static java.nio.charset.StandardCharsets.UTF_8 ;
import static org.eclipse.jgit.lib.ObjectChecker.author ;
import static org.eclipse.jgit.lib.ObjectChecker.author ;
import static org.eclipse.jgit.lib.ObjectChecker.committer ;
import static org.eclipse.jgit.lib.ObjectChecker.committer ;
import static org.eclipse.jgit.lib.ObjectChecker.encoding ;
import static org.eclipse.jgit.lib.ObjectChecker.encoding ;
@ -60,6 +62,7 @@ import java.util.Arrays;
import java.util.HashMap ;
import java.util.HashMap ;
import java.util.Map ;
import java.util.Map ;
import org.eclipse.jgit.annotations.Nullable ;
import org.eclipse.jgit.lib.Constants ;
import org.eclipse.jgit.lib.Constants ;
import org.eclipse.jgit.lib.PersonIdent ;
import org.eclipse.jgit.lib.PersonIdent ;
@ -70,7 +73,7 @@ public final class RawParseUtils {
*
*
* @since 2 . 2
* @since 2 . 2
* /
* /
public static final Charset UTF8_CHARSET = Charset . forName ( "UTF-8" ) ; //$NON-NLS-1$
public static final Charset UTF8_CHARSET = UTF_8 ;
private static final byte [ ] digits10 ;
private static final byte [ ] digits10 ;
@ -81,9 +84,9 @@ public final class RawParseUtils {
private static final Map < String , Charset > encodingAliases ;
private static final Map < String , Charset > encodingAliases ;
static {
static {
encodingAliases = new HashMap < String , Charset > ( ) ;
encodingAliases = new HashMap < > ( ) ;
encodingAliases . put ( "latin-1" , Charset . forName ( "ISO-8859-1" ) ) ; //$NON-NLS-1$ //$NON-NLS-2 $
encodingAliases . put ( "latin-1" , ISO_8859_1 ) ; //$NON-NLS-1$
encodingAliases . put ( "'utf8'" , Charset . forName ( "UTF-8" ) ) ; //$NON-NLS-1$ //$NON-NLS-2 $
encodingAliases . put ( "iso-latin-1" , ISO_8859_1 ) ; //$NON-NLS-1$
digits10 = new byte [ '9' + 1 ] ;
digits10 = new byte [ '9' + 1 ] ;
Arrays . fill ( digits10 , ( byte ) - 1 ) ;
Arrays . fill ( digits10 , ( byte ) - 1 ) ;
@ -671,6 +674,27 @@ public final class RawParseUtils {
return match ( b , ptr , encoding ) ;
return match ( b , ptr , encoding ) ;
}
}
/ * *
* Parse the "encoding " header as a string .
* < p >
* Locates the "encoding " header ( if present ) and returns its value .
*
* @param b
* buffer to scan .
* @return the encoding header as specified in the commit ; null if the
* header was not present and should be assumed .
* @since 4 . 2
* /
@Nullable
public static String parseEncodingName ( final byte [ ] b ) {
int enc = encoding ( b , 0 ) ;
if ( enc < 0 ) {
return null ;
}
int lf = nextLF ( b , enc ) ;
return decode ( UTF_8 , b , enc , lf - 1 ) ;
}
/ * *
/ * *
* Parse the "encoding " header into a character set reference .
* Parse the "encoding " header into a character set reference .
* < p >
* < p >
@ -678,29 +702,33 @@ public final class RawParseUtils {
* { @link # encoding ( byte [ ] , int ) } and then returns the proper character set
* { @link # encoding ( byte [ ] , int ) } and then returns the proper character set
* to apply to this buffer to evaluate its contents as character data .
* to apply to this buffer to evaluate its contents as character data .
* < p >
* < p >
* If no encoding header is present , { @link Constants # CHARSET } is assumed .
* If no encoding header is present { @code UTF - 8 } is assumed .
*
*
* @param b
* @param b
* buffer to scan .
* buffer to scan .
* @return the Java character set representation . Never null .
* @return the Java character set representation . Never null .
* @throws IllegalCharsetNameException
* if the character set requested by the encoding header is
* malformed and unsupportable .
* @throws UnsupportedCharsetException
* if the JRE does not support the character set requested by
* the encoding header .
* /
* /
public static Charset parseEncoding ( final byte [ ] b ) {
public static Charset parseEncoding ( final byte [ ] b ) {
final int enc = encoding ( b , 0 ) ;
String enc = parseEncodingName ( b ) ;
if ( enc < 0 )
if ( enc = = null ) {
return Constants . CHARSET ;
return UTF_8 ;
final int lf = nextLF ( b , enc ) ;
}
String decoded = decode ( Constants . CHARSET , b , enc , lf - 1 ) ;
String name = enc . trim ( ) ;
try {
try {
return Charset . forName ( decoded ) ;
return Charset . forName ( name ) ;
} catch ( IllegalCharsetNameException badName ) {
} catch ( IllegalCharsetNameException
Charset aliased = charsetForAlias ( decoded ) ;
| UnsupportedCharsetException badName ) {
if ( aliased ! = null )
Charset aliased = charsetForAlias ( name ) ;
return aliased ;
if ( aliased ! = null ) {
throw badName ;
} catch ( UnsupportedCharsetException badName ) {
Charset aliased = charsetForAlias ( decoded ) ;
if ( aliased ! = null )
return aliased ;
return aliased ;
}
throw badName ;
throw badName ;
}
}
}
}
@ -739,7 +767,15 @@ public final class RawParseUtils {
* parsed .
* parsed .
* /
* /
public static PersonIdent parsePersonIdent ( final byte [ ] raw , final int nameB ) {
public static PersonIdent parsePersonIdent ( final byte [ ] raw , final int nameB ) {
final Charset cs = parseEncoding ( raw ) ;
Charset cs ;
try {
cs = parseEncoding ( raw ) ;
} catch ( IllegalCharsetNameException | UnsupportedCharsetException e ) {
// Assume UTF-8 for person identities, usually this is correct.
// If not decode() will fall back to the ISO-8859-1 encoding.
cs = UTF_8 ;
}
final int emailB = nextLF ( raw , nameB , '<' ) ;
final int emailB = nextLF ( raw , nameB , '<' ) ;
final int emailE = nextLF ( raw , emailB , '>' ) ;
final int emailE = nextLF ( raw , emailB , '>' ) ;
if ( emailB > = raw . length | | raw [ emailB ] = = '\n' | |
if ( emailB > = raw . length | | raw [ emailB ] = = '\n' | |
@ -887,7 +923,7 @@ public final class RawParseUtils {
* /
* /
public static String decode ( final byte [ ] buffer , final int start ,
public static String decode ( final byte [ ] buffer , final int start ,
final int end ) {
final int end ) {
return decode ( Constants . CHARSET , buffer , start , end ) ;
return decode ( UTF_8 , buffer , start , end ) ;
}
}
/ * *
/ * *
@ -961,23 +997,21 @@ public final class RawParseUtils {
public static String decodeNoFallback ( final Charset cs ,
public static String decodeNoFallback ( final Charset cs ,
final byte [ ] buffer , final int start , final int end )
final byte [ ] buffer , final int start , final int end )
throws CharacterCodingException {
throws CharacterCodingException {
final ByteBuffer b = ByteBuffer . wrap ( buffer , start , end - start ) ;
ByteBuffer b = ByteBuffer . wrap ( buffer , start , end - start ) ;
b . mark ( ) ;
b . mark ( ) ;
// Try our built-in favorite. The assumption here is that
// Try our built-in favorite. The assumption here is that
// decoding will fail if the data is not actually encoded
// decoding will fail if the data is not actually encoded
// using that encoder.
// using that encoder.
//
try {
try {
return decode ( b , Constants . CHARSET ) ;
return decode ( b , UTF_8 ) ;
} catch ( CharacterCodingException e ) {
} catch ( CharacterCodingException e ) {
b . reset ( ) ;
b . reset ( ) ;
}
}
if ( ! cs . equals ( Constants . CHARSET ) ) {
if ( ! cs . equals ( UTF_8 ) ) {
// Try the suggested encoding, it might be right since it was
// Try the suggested encoding, it might be right since it was
// provided by the caller.
// provided by the caller.
//
try {
try {
return decode ( b , cs ) ;
return decode ( b , cs ) ;
} catch ( CharacterCodingException e ) {
} catch ( CharacterCodingException e ) {
@ -987,9 +1021,8 @@ public final class RawParseUtils {
// Try the default character set. A small group of people
// Try the default character set. A small group of people
// might actually use the same (or very similar) locale.
// might actually use the same (or very similar) locale.
//
Charset defcs = Charset . defaultCharset ( ) ;
final Charset defcs = Charset . defaultCharset ( ) ;
if ( ! defcs . equals ( cs ) & & ! defcs . equals ( UTF_8 ) ) {
if ( ! defcs . equals ( cs ) & & ! defcs . equals ( Constants . CHARSET ) ) {
try {
try {
return decode ( b , defcs ) ;
return decode ( b , defcs ) ;
} catch ( CharacterCodingException e ) {
} catch ( CharacterCodingException e ) {