RevCommit: Better support invalid encoding headers

With this support we no longer need the "'utf8'" alias. UTF-8 will be automatically tried when the encoding header is not recognized, and is used if the character sequence cleanly decodes as UTF-8.

Modernize some of the references to use StandardCharsets.

Change-Id: I4c0c88750475560e1f2263180c4a98eb8febeca0
9 years ago · 31d92ace5b
5 changed files with 263 additions and 56 deletions
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevCommitParseTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevCommitParseTest.java
@@ -43,13 +43,18 @@

 package org.eclipse.jgit.revwalk;

+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;

 import java.io.ByteArrayOutputStream;
 import java.io.UnsupportedEncodingException;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.TimeZone;

 import org.eclipse.jgit.junit.RepositoryTestCase;
@@ -303,6 +308,86 @@ public class RevCommitParseTest extends RepositoryTestCase {
 		assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage());
 	}

+	@Test
+	public void testParse_incorrectUtf8Name() throws Exception {
+		ByteArrayOutputStream b = new ByteArrayOutputStream();
+		b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n"
+				.getBytes(UTF_8));
+		b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8));
+		b.write("committer co <c@example.com> 1218123390 -0500\n"
+				.getBytes(UTF_8));
+		b.write("encoding 'utf8'\n".getBytes(UTF_8));
+		b.write("\n".getBytes(UTF_8));
+		b.write("Sm\u00f6rg\u00e5sbord\n".getBytes(UTF_8));
+
+		RevCommit c = new RevCommit(
+				id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
+		c.parseCanonical(new RevWalk(db), b.toByteArray());
+		assertEquals("'utf8'", c.getEncodingName());
+		assertEquals("Sm\u00f6rg\u00e5sbord\n", c.getFullMessage());
+
+		try {
+			c.getEncoding();
+			fail("Expected " + IllegalCharsetNameException.class);
+		} catch (IllegalCharsetNameException badName) {
+			assertEquals("'utf8'", badName.getMessage());
+		}
+	}
+
+	@Test
+	public void testParse_illegalEncoding() throws Exception {
+		ByteArrayOutputStream b = new ByteArrayOutputStream();
+		b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
+		b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8));
+		b.write("committer co <c@example.com> 1218123390 -0500\n".getBytes(UTF_8));
+		b.write("encoding utf-8logoutputencoding=gbk\n".getBytes(UTF_8));
+		b.write("\n".getBytes(UTF_8));
+		b.write("message\n".getBytes(UTF_8));
+
+		RevCommit c = new RevCommit(
+				id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
+		c.parseCanonical(new RevWalk(db), b.toByteArray());
+		assertEquals("utf-8logoutputencoding=gbk", c.getEncodingName());
+		assertEquals("message\n", c.getFullMessage());
+		assertEquals("message", c.getShortMessage());
+		assertTrue(c.getFooterLines().isEmpty());
+		assertEquals("au", c.getAuthorIdent().getName());
+
+		try {
+			c.getEncoding();
+			fail("Expected " + IllegalCharsetNameException.class);
+		} catch (IllegalCharsetNameException badName) {
+			assertEquals("utf-8logoutputencoding=gbk", badName.getMessage());
+		}
+	}
+
+	@Test
+	public void testParse_unsupportedEncoding() throws Exception {
+		ByteArrayOutputStream b = new ByteArrayOutputStream();
+		b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
+		b.write("author au <a@example.com> 1218123387 +0700\n".getBytes(UTF_8));
+		b.write("committer co <c@example.com> 1218123390 -0500\n".getBytes(UTF_8));
+		b.write("encoding it_IT.UTF8\n".getBytes(UTF_8));
+		b.write("\n".getBytes(UTF_8));
+		b.write("message\n".getBytes(UTF_8));
+
+		RevCommit c = new RevCommit(
+				id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
+		c.parseCanonical(new RevWalk(db), b.toByteArray());
+		assertEquals("it_IT.UTF8", c.getEncodingName());
+		assertEquals("message\n", c.getFullMessage());
+		assertEquals("message", c.getShortMessage());
+		assertTrue(c.getFooterLines().isEmpty());
+		assertEquals("au", c.getAuthorIdent().getName());
+
+		try {
+			c.getEncoding();
+			fail("Expected " + UnsupportedCharsetException.class);
+		} catch (UnsupportedCharsetException badName) {
+			assertEquals("it_IT.UTF8", badName.getMessage());
+		}
+	}
+
 	@Test
 	public void testParse_NoMessage() throws Exception {
 		final String msg = "";
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevTagParseTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevTagParseTest.java
@@ -43,6 +43,7 @@

 package org.eclipse.jgit.revwalk;

+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
@@ -361,6 +362,44 @@ public class RevTagParseTest extends RepositoryTestCase {
 		assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage());
 	}

+	@Test
+	public void testParse_illegalEncoding() throws Exception {
+		ByteArrayOutputStream b = new ByteArrayOutputStream();
+		b.write("object 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
+		b.write("type tree\n".getBytes(UTF_8));
+		b.write("tag v1.0\n".getBytes(UTF_8));
+		b.write("tagger t <t@example.com> 1218123387 +0700\n".getBytes(UTF_8));
+		b.write("encoding utf-8logoutputencoding=gbk\n".getBytes(UTF_8));
+		b.write("\n".getBytes(UTF_8));
+		b.write("message\n".getBytes(UTF_8));
+
+		RevTag t = new RevTag(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
+		t.parseCanonical(new RevWalk(db), b.toByteArray());
+
+		assertEquals("t", t.getTaggerIdent().getName());
+		assertEquals("message", t.getShortMessage());
+		assertEquals("message\n", t.getFullMessage());
+	}
+
+	@Test
+	public void testParse_unsupportedEncoding() throws Exception {
+		ByteArrayOutputStream b = new ByteArrayOutputStream();
+		b.write("object 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8));
+		b.write("type tree\n".getBytes(UTF_8));
+		b.write("tag v1.0\n".getBytes(UTF_8));
+		b.write("tagger t <t@example.com> 1218123387 +0700\n".getBytes(UTF_8));
+		b.write("encoding it_IT.UTF8\n".getBytes(UTF_8));
+		b.write("\n".getBytes(UTF_8));
+		b.write("message\n".getBytes(UTF_8));
+
+		RevTag t = new RevTag(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67"));
+		t.parseCanonical(new RevWalk(db), b.toByteArray());
+
+		assertEquals("t", t.getTaggerIdent().getName());
+		assertEquals("message", t.getShortMessage());
+		assertEquals("message\n", t.getFullMessage());
+	}
+
 	@Test
 	public void testParse_NoMessage() throws Exception {
 		final String msg = "";
--- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java
@@ -44,12 +44,17 @@

 package org.eclipse.jgit.revwalk;

+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.io.IOException;
 import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;

+import org.eclipse.jgit.annotations.Nullable;
 import org.eclipse.jgit.errors.IncorrectObjectTypeException;
 import org.eclipse.jgit.errors.MissingObjectException;
 import org.eclipse.jgit.lib.AnyObjectId;
@@ -441,12 +446,12 @@ public class RevCommit extends RevObject {
 	 * @return decoded commit message as a string. Never null.
 	 */
 	public final String getFullMessage() {
-		final byte[] raw = buffer;
-		final int msgB = RawParseUtils.commitMessage(raw, 0);
-		if (msgB < 0)
+		byte[] raw = buffer;
+		int msgB = RawParseUtils.commitMessage(raw, 0);
+		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
-		final Charset enc = RawParseUtils.parseEncoding(raw);
-		return RawParseUtils.decode(enc, raw, msgB, raw.length);
+		}
+		return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length);
 	}

 	/**
@@ -465,16 +470,17 @@ public class RevCommit extends RevObject {
 	 *         spanned multiple lines. Embedded LFs are converted to spaces.
 	 */
 	public final String getShortMessage() {
-		final byte[] raw = buffer;
-		final int msgB = RawParseUtils.commitMessage(raw, 0);
-		if (msgB < 0)
+		byte[] raw = buffer;
+		int msgB = RawParseUtils.commitMessage(raw, 0);
+		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
+		}

-		final Charset enc = RawParseUtils.parseEncoding(raw);
-		final int msgE = RawParseUtils.endOfParagraph(raw, msgB);
-		String str = RawParseUtils.decode(enc, raw, msgB, msgE);
-		if (hasLF(raw, msgB, msgE))
+		int msgE = RawParseUtils.endOfParagraph(raw, msgB);
+		String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE);
+		if (hasLF(raw, msgB, msgE)) {
 			str = StringUtils.replaceLineBreaksWithSpace(str);
+		}
 		return str;
 	}

@@ -485,6 +491,23 @@ public class RevCommit extends RevObject {
 		return false;
 	}

+	/**
+	 * Determine the encoding of the commit message buffer.
+	 * <p>
+	 * Locates the "encoding" header (if present) and returns its value. Due to
+	 * corruption in the wild this may be an invalid encoding name that is not
+	 * recognized by any character encoding library.
+	 * <p>
+	 * If no encoding header is present, null.
+	 *
+	 * @return the preferred encoding of {@link #getRawBuffer()}; or null.
+	 * @since 4.2
+	 */
+	@Nullable
+	public final String getEncodingName() {
+		return RawParseUtils.parseEncodingName(buffer);
+	}
+
 	/**
 	 * Determine the encoding of the commit message buffer.
 	 * <p>
@@ -492,14 +515,28 @@ public class RevCommit extends RevObject {
 	 * character set to apply to this buffer to evaluate its contents as
 	 * character data.
 	 * <p>
-	 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
+	 * If no encoding header is present {@code UTF-8} is assumed.
 	 *
 	 * @return the preferred encoding of {@link #getRawBuffer()}.
+	 * @throws IllegalCharsetNameException
+	 *             if the character set requested by the encoding header is
+	 *             malformed and unsupportable.
+	 * @throws UnsupportedCharsetException
+	 *             if the JRE does not support the character set requested by
+	 *             the encoding header.
 	 */
 	public final Charset getEncoding() {
 		return RawParseUtils.parseEncoding(buffer);
 	}

+	private Charset guessEncoding() {
+		try {
+			return getEncoding();
+		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+			return UTF_8;
+		}
+	}
+
 	/**
 	 * Parse the footer lines (e.g. "Signed-off-by") for machine processing.
 	 * <p>
@@ -529,7 +566,7 @@ public class RevCommit extends RevObject {

 		final int msgB = RawParseUtils.commitMessage(raw, 0);
 		final ArrayList<FooterLine> r = new ArrayList<FooterLine>(4);
-		final Charset enc = getEncoding();
+		final Charset enc = guessEncoding();
 		for (;;) {
 			ptr = RawParseUtils.prevLF(raw, ptr);
 			if (ptr <= msgB)
--- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java
@@ -45,8 +45,12 @@

 package org.eclipse.jgit.revwalk;

+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.io.IOException;
 import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;

 import org.eclipse.jgit.errors.CorruptObjectException;
 import org.eclipse.jgit.errors.IncorrectObjectTypeException;
@@ -162,7 +166,7 @@ public class RevTag extends RevObject {

 		int p = pos.value += 4; // "tag "
 		final int nameEnd = RawParseUtils.nextLF(rawTag, p) - 1;
-		tagName = RawParseUtils.decode(Constants.CHARSET, rawTag, p, nameEnd);
+		tagName = RawParseUtils.decode(UTF_8, rawTag, p, nameEnd);

 		if (walk.isRetainBody())
 			buffer = rawTag;
@@ -207,12 +211,12 @@ public class RevTag extends RevObject {
 	 * @return decoded tag message as a string. Never null.
 	 */
 	public final String getFullMessage() {
-		final byte[] raw = buffer;
-		final int msgB = RawParseUtils.tagMessage(raw, 0);
-		if (msgB < 0)
+		byte[] raw = buffer;
+		int msgB = RawParseUtils.tagMessage(raw, 0);
+		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
-		final Charset enc = RawParseUtils.parseEncoding(raw);
-		return RawParseUtils.decode(enc, raw, msgB, raw.length);
+		}
+		return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length);
 	}

 	/**
@@ -231,19 +235,28 @@ public class RevTag extends RevObject {
 	 *         multiple lines. Embedded LFs are converted to spaces.
 	 */
 	public final String getShortMessage() {
-		final byte[] raw = buffer;
-		final int msgB = RawParseUtils.tagMessage(raw, 0);
-		if (msgB < 0)
+		byte[] raw = buffer;
+		int msgB = RawParseUtils.tagMessage(raw, 0);
+		if (msgB < 0) {
 			return ""; //$NON-NLS-1$
+		}

-		final Charset enc = RawParseUtils.parseEncoding(raw);
-		final int msgE = RawParseUtils.endOfParagraph(raw, msgB);
-		String str = RawParseUtils.decode(enc, raw, msgB, msgE);
-		if (RevCommit.hasLF(raw, msgB, msgE))
+		int msgE = RawParseUtils.endOfParagraph(raw, msgB);
+		String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE);
+		if (RevCommit.hasLF(raw, msgB, msgE)) {
 			str = StringUtils.replaceLineBreaksWithSpace(str);
+		}
 		return str;
 	}

+	private Charset guessEncoding() {
+		try {
+			return RawParseUtils.parseEncoding(buffer);
+		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+			return UTF_8;
+		}
+	}
+
 	/**
 	 * Get a reference to the object this tag was placed on.
 	 * <p>
--- a/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java
@@ -44,6 +44,8 @@

 package org.eclipse.jgit.util;

+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.eclipse.jgit.lib.ObjectChecker.author;
 import static org.eclipse.jgit.lib.ObjectChecker.committer;
 import static org.eclipse.jgit.lib.ObjectChecker.encoding;
@@ -60,6 +62,7 @@ import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;

+import org.eclipse.jgit.annotations.Nullable;
 import org.eclipse.jgit.lib.Constants;
 import org.eclipse.jgit.lib.PersonIdent;

@@ -70,7 +73,7 @@ public final class RawParseUtils {
 	 *
 	 * @since 2.2
 	 */
-	public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$
+	public static final Charset UTF8_CHARSET = UTF_8;

 	private static final byte[] digits10;

@@ -81,9 +84,9 @@ public final class RawParseUtils {
 	private static final Map<String, Charset> encodingAliases;

 	static {
-		encodingAliases = new HashMap<String, Charset>();
-		encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$
-		encodingAliases.put("'utf8'", Charset.forName("UTF-8")); //$NON-NLS-1$ //$NON-NLS-2$
+		encodingAliases = new HashMap<>();
+		encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
+		encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$

 		digits10 = new byte['9' + 1];
 		Arrays.fill(digits10, (byte) -1);
@@ -671,6 +674,27 @@ public final class RawParseUtils {
 		return match(b, ptr, encoding);
 	}

+	/**
+	 * Parse the "encoding " header as a string.
+	 * <p>
+	 * Locates the "encoding " header (if present) and returns its value.
+	 *
+	 * @param b
+	 *            buffer to scan.
+	 * @return the encoding header as specified in the commit; null if the
+	 *         header was not present and should be assumed.
+	 * @since 4.2
+	 */
+	@Nullable
+	public static String parseEncodingName(final byte[] b) {
+		int enc = encoding(b, 0);
+		if (enc < 0) {
+			return null;
+		}
+		int lf = nextLF(b, enc);
+		return decode(UTF_8, b, enc, lf - 1);
+	}
+
 	/**
 	 * Parse the "encoding " header into a character set reference.
 	 * <p>
@@ -678,29 +702,33 @@ public final class RawParseUtils {
 	 * {@link #encoding(byte[], int)} and then returns the proper character set
 	 * to apply to this buffer to evaluate its contents as character data.
 	 * <p>
-	 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
+	 * If no encoding header is present {@code UTF-8} is assumed.
 	 *
 	 * @param b
 	 *            buffer to scan.
 	 * @return the Java character set representation. Never null.
+	 * @throws IllegalCharsetNameException
+	 *             if the character set requested by the encoding header is
+	 *             malformed and unsupportable.
+	 * @throws UnsupportedCharsetException
+	 *             if the JRE does not support the character set requested by
+	 *             the encoding header.
 	 */
 	public static Charset parseEncoding(final byte[] b) {
-		final int enc = encoding(b, 0);
-		if (enc < 0)
-			return Constants.CHARSET;
-		final int lf = nextLF(b, enc);
-		String decoded = decode(Constants.CHARSET, b, enc, lf - 1);
+		String enc = parseEncodingName(b);
+		if (enc == null) {
+			return UTF_8;
+		}
+
+		String name = enc.trim();
 		try {
-			return Charset.forName(decoded);
-		} catch (IllegalCharsetNameException badName) {
-			Charset aliased = charsetForAlias(decoded);
-			if (aliased != null)
-				return aliased;
-			throw badName;
-		} catch (UnsupportedCharsetException badName) {
-			Charset aliased = charsetForAlias(decoded);
-			if (aliased != null)
+			return Charset.forName(name);
+		} catch (IllegalCharsetNameException
+				| UnsupportedCharsetException badName) {
+			Charset aliased = charsetForAlias(name);
+			if (aliased != null) {
 				return aliased;
+			}
 			throw badName;
 		}
 	}
@@ -739,7 +767,15 @@ public final class RawParseUtils {
 	 *         parsed.
 	 */
 	public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
-		final Charset cs = parseEncoding(raw);
+		Charset cs;
+		try {
+			cs = parseEncoding(raw);
+		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+			// Assume UTF-8 for person identities, usually this is correct.
+			// If not decode() will fall back to the ISO-8859-1 encoding.
+			cs = UTF_8;
+		}
+
 		final int emailB = nextLF(raw, nameB, '<');
 		final int emailE = nextLF(raw, emailB, '>');
 		if (emailB >= raw.length || raw[emailB] == '\n' ||
@@ -887,7 +923,7 @@ public final class RawParseUtils {
 	 */
 	public static String decode(final byte[] buffer, final int start,
 			final int end) {
-		return decode(Constants.CHARSET, buffer, start, end);
+		return decode(UTF_8, buffer, start, end);
 	}

 	/**
@@ -961,23 +997,21 @@ public final class RawParseUtils {
 	public static String decodeNoFallback(final Charset cs,
 			final byte[] buffer, final int start, final int end)
 			throws CharacterCodingException {
-		final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
+		ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
 		b.mark();

 		// Try our built-in favorite. The assumption here is that
 		// decoding will fail if the data is not actually encoded
 		// using that encoder.
-		//
 		try {
-			return decode(b, Constants.CHARSET);
+			return decode(b, UTF_8);
 		} catch (CharacterCodingException e) {
 			b.reset();
 		}

-		if (!cs.equals(Constants.CHARSET)) {
+		if (!cs.equals(UTF_8)) {
 			// Try the suggested encoding, it might be right since it was
 			// provided by the caller.
-			//
 			try {
 				return decode(b, cs);
 			} catch (CharacterCodingException e) {
@@ -987,9 +1021,8 @@ public final class RawParseUtils {

 		// Try the default character set. A small group of people
 		// might actually use the same (or very similar) locale.
-		//
-		final Charset defcs = Charset.defaultCharset();
-		if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
+		Charset defcs = Charset.defaultCharset();
+		if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
 			try {
 				return decode(b, defcs);
 			} catch (CharacterCodingException e) {