From 31d92ace5b0b4ca66b3fa191b8a1207d6e8fecc6 Mon Sep 17 00:00:00 2001 From: Shawn Pearce Date: Mon, 11 Jan 2016 12:30:35 -0800 Subject: [PATCH] RevCommit: Better support invalid encoding headers With this support we no longer need the 'utf-8' alias. UTF-8 will be automatically tried when the encoding header is not recognized and used if the character sequence cleanly decodes as UTF-8. Modernize some of the references to use StandardCharsets. Change-Id: I4c0c88750475560e1f2263180c4a98eb8febeca0 --- .../jgit/revwalk/RevCommitParseTest.java | 85 +++++++++++++++++ .../eclipse/jgit/revwalk/RevTagParseTest.java | 39 ++++++++ .../org/eclipse/jgit/revwalk/RevCommit.java | 65 ++++++++++--- .../src/org/eclipse/jgit/revwalk/RevTag.java | 39 +++++--- .../org/eclipse/jgit/util/RawParseUtils.java | 91 +++++++++++++------ 5 files changed, 263 insertions(+), 56 deletions(-) diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevCommitParseTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevCommitParseTest.java index beda2a7b9..885c1b5b2 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevCommitParseTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevCommitParseTest.java @@ -43,13 +43,18 @@ package org.eclipse.jgit.revwalk; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import java.io.ByteArrayOutputStream; import java.io.UnsupportedEncodingException; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; import java.util.TimeZone; import org.eclipse.jgit.junit.RepositoryTestCase; @@ -303,6 +308,86 @@ public class RevCommitParseTest extends RepositoryTestCase { assertEquals("\u304d\u308c\u3044\n\nHi\n", 
c.getFullMessage()); } + @Test + public void testParse_incorrectUtf8Name() throws Exception { + ByteArrayOutputStream b = new ByteArrayOutputStream(); + b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n" + .getBytes(UTF_8)); + b.write("author au 1218123387 +0700\n".getBytes(UTF_8)); + b.write("committer co 1218123390 -0500\n" + .getBytes(UTF_8)); + b.write("encoding 'utf8'\n".getBytes(UTF_8)); + b.write("\n".getBytes(UTF_8)); + b.write("Sm\u00f6rg\u00e5sbord\n".getBytes(UTF_8)); + + RevCommit c = new RevCommit( + id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); + c.parseCanonical(new RevWalk(db), b.toByteArray()); + assertEquals("'utf8'", c.getEncodingName()); + assertEquals("Sm\u00f6rg\u00e5sbord\n", c.getFullMessage()); + + try { + c.getEncoding(); + fail("Expected " + IllegalCharsetNameException.class); + } catch (IllegalCharsetNameException badName) { + assertEquals("'utf8'", badName.getMessage()); + } + } + + @Test + public void testParse_illegalEncoding() throws Exception { + ByteArrayOutputStream b = new ByteArrayOutputStream(); + b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8)); + b.write("author au 1218123387 +0700\n".getBytes(UTF_8)); + b.write("committer co 1218123390 -0500\n".getBytes(UTF_8)); + b.write("encoding utf-8logoutputencoding=gbk\n".getBytes(UTF_8)); + b.write("\n".getBytes(UTF_8)); + b.write("message\n".getBytes(UTF_8)); + + RevCommit c = new RevCommit( + id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); + c.parseCanonical(new RevWalk(db), b.toByteArray()); + assertEquals("utf-8logoutputencoding=gbk", c.getEncodingName()); + assertEquals("message\n", c.getFullMessage()); + assertEquals("message", c.getShortMessage()); + assertTrue(c.getFooterLines().isEmpty()); + assertEquals("au", c.getAuthorIdent().getName()); + + try { + c.getEncoding(); + fail("Expected " + IllegalCharsetNameException.class); + } catch (IllegalCharsetNameException badName) { + assertEquals("utf-8logoutputencoding=gbk", 
badName.getMessage()); + } + } + + @Test + public void testParse_unsupportedEncoding() throws Exception { + ByteArrayOutputStream b = new ByteArrayOutputStream(); + b.write("tree 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8)); + b.write("author au 1218123387 +0700\n".getBytes(UTF_8)); + b.write("committer co 1218123390 -0500\n".getBytes(UTF_8)); + b.write("encoding it_IT.UTF8\n".getBytes(UTF_8)); + b.write("\n".getBytes(UTF_8)); + b.write("message\n".getBytes(UTF_8)); + + RevCommit c = new RevCommit( + id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); + c.parseCanonical(new RevWalk(db), b.toByteArray()); + assertEquals("it_IT.UTF8", c.getEncodingName()); + assertEquals("message\n", c.getFullMessage()); + assertEquals("message", c.getShortMessage()); + assertTrue(c.getFooterLines().isEmpty()); + assertEquals("au", c.getAuthorIdent().getName()); + + try { + c.getEncoding(); + fail("Expected " + UnsupportedCharsetException.class); + } catch (UnsupportedCharsetException badName) { + assertEquals("it_IT.UTF8", badName.getMessage()); + } + } + @Test public void testParse_NoMessage() throws Exception { final String msg = ""; diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevTagParseTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevTagParseTest.java index 614f49bf0..82505caf2 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevTagParseTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/revwalk/RevTagParseTest.java @@ -43,6 +43,7 @@ package org.eclipse.jgit.revwalk; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; @@ -361,6 +362,44 @@ public class RevTagParseTest extends RepositoryTestCase { assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage()); } + @Test + public void testParse_illegalEncoding() throws Exception { + ByteArrayOutputStream b = new 
ByteArrayOutputStream(); + b.write("object 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8)); + b.write("type tree\n".getBytes(UTF_8)); + b.write("tag v1.0\n".getBytes(UTF_8)); + b.write("tagger t 1218123387 +0700\n".getBytes(UTF_8)); + b.write("encoding utf-8logoutputencoding=gbk\n".getBytes(UTF_8)); + b.write("\n".getBytes(UTF_8)); + b.write("message\n".getBytes(UTF_8)); + + RevTag t = new RevTag(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); + t.parseCanonical(new RevWalk(db), b.toByteArray()); + + assertEquals("t", t.getTaggerIdent().getName()); + assertEquals("message", t.getShortMessage()); + assertEquals("message\n", t.getFullMessage()); + } + + @Test + public void testParse_unsupportedEncoding() throws Exception { + ByteArrayOutputStream b = new ByteArrayOutputStream(); + b.write("object 9788669ad918b6fcce64af8882fc9a81cb6aba67\n".getBytes(UTF_8)); + b.write("type tree\n".getBytes(UTF_8)); + b.write("tag v1.0\n".getBytes(UTF_8)); + b.write("tagger t 1218123387 +0700\n".getBytes(UTF_8)); + b.write("encoding it_IT.UTF8\n".getBytes(UTF_8)); + b.write("\n".getBytes(UTF_8)); + b.write("message\n".getBytes(UTF_8)); + + RevTag t = new RevTag(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); + t.parseCanonical(new RevWalk(db), b.toByteArray()); + + assertEquals("t", t.getTaggerIdent().getName()); + assertEquals("message", t.getShortMessage()); + assertEquals("message\n", t.getFullMessage()); + } + @Test public void testParse_NoMessage() throws Exception { final String msg = ""; diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java index c23e4e328..e67ada602 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevCommit.java @@ -44,12 +44,17 @@ package org.eclipse.jgit.revwalk; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.IOException; import java.nio.charset.Charset; 
+import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import org.eclipse.jgit.annotations.Nullable; import org.eclipse.jgit.errors.IncorrectObjectTypeException; import org.eclipse.jgit.errors.MissingObjectException; import org.eclipse.jgit.lib.AnyObjectId; @@ -441,12 +446,12 @@ public class RevCommit extends RevObject { * @return decoded commit message as a string. Never null. */ public final String getFullMessage() { - final byte[] raw = buffer; - final int msgB = RawParseUtils.commitMessage(raw, 0); - if (msgB < 0) + byte[] raw = buffer; + int msgB = RawParseUtils.commitMessage(raw, 0); + if (msgB < 0) { return ""; //$NON-NLS-1$ - final Charset enc = RawParseUtils.parseEncoding(raw); - return RawParseUtils.decode(enc, raw, msgB, raw.length); + } + return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length); } /** @@ -465,16 +470,17 @@ public class RevCommit extends RevObject { * spanned multiple lines. Embedded LFs are converted to spaces. */ public final String getShortMessage() { - final byte[] raw = buffer; - final int msgB = RawParseUtils.commitMessage(raw, 0); - if (msgB < 0) + byte[] raw = buffer; + int msgB = RawParseUtils.commitMessage(raw, 0); + if (msgB < 0) { return ""; //$NON-NLS-1$ + } - final Charset enc = RawParseUtils.parseEncoding(raw); - final int msgE = RawParseUtils.endOfParagraph(raw, msgB); - String str = RawParseUtils.decode(enc, raw, msgB, msgE); - if (hasLF(raw, msgB, msgE)) + int msgE = RawParseUtils.endOfParagraph(raw, msgB); + String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE); + if (hasLF(raw, msgB, msgE)) { str = StringUtils.replaceLineBreaksWithSpace(str); + } return str; } @@ -485,6 +491,23 @@ public class RevCommit extends RevObject { return false; } + /** + * Determine the encoding of the commit message buffer. + *

+ * Locates the "encoding" header (if present) and returns its value. Due to + * corruption in the wild this may be an invalid encoding name that is not + * recognized by any character encoding library. + *

+ * Returns null if no encoding header is present. + * + * @return the preferred encoding of {@link #getRawBuffer()}; or null. + * @since 4.2 + */ + @Nullable + public final String getEncodingName() { + return RawParseUtils.parseEncodingName(buffer); + } + /** * Determine the encoding of the commit message buffer. *

@@ -492,14 +515,28 @@ public class RevCommit extends RevObject { * character set to apply to this buffer to evaluate its contents as * character data. *

- * If no encoding header is present, {@link Constants#CHARSET} is assumed. + * If no encoding header is present {@code UTF-8} is assumed. * * @return the preferred encoding of {@link #getRawBuffer()}. + * @throws IllegalCharsetNameException + * if the character set requested by the encoding header is + * malformed and unsupportable. + * @throws UnsupportedCharsetException + * if the JRE does not support the character set requested by + * the encoding header. */ public final Charset getEncoding() { return RawParseUtils.parseEncoding(buffer); } + private Charset guessEncoding() { + try { + return getEncoding(); + } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { + return UTF_8; + } + } + /** * Parse the footer lines (e.g. "Signed-off-by") for machine processing. *

@@ -529,7 +566,7 @@ public class RevCommit extends RevObject { final int msgB = RawParseUtils.commitMessage(raw, 0); final ArrayList r = new ArrayList(4); - final Charset enc = getEncoding(); + final Charset enc = guessEncoding(); for (;;) { ptr = RawParseUtils.prevLF(raw, ptr); if (ptr <= msgB) diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java index bf2785e0d..81a54bf7e 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/revwalk/RevTag.java @@ -45,8 +45,12 @@ package org.eclipse.jgit.revwalk; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.IOException; import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.UnsupportedCharsetException; import org.eclipse.jgit.errors.CorruptObjectException; import org.eclipse.jgit.errors.IncorrectObjectTypeException; @@ -162,7 +166,7 @@ public class RevTag extends RevObject { int p = pos.value += 4; // "tag " final int nameEnd = RawParseUtils.nextLF(rawTag, p) - 1; - tagName = RawParseUtils.decode(Constants.CHARSET, rawTag, p, nameEnd); + tagName = RawParseUtils.decode(UTF_8, rawTag, p, nameEnd); if (walk.isRetainBody()) buffer = rawTag; @@ -207,12 +211,12 @@ public class RevTag extends RevObject { * @return decoded tag message as a string. Never null. */ public final String getFullMessage() { - final byte[] raw = buffer; - final int msgB = RawParseUtils.tagMessage(raw, 0); - if (msgB < 0) + byte[] raw = buffer; + int msgB = RawParseUtils.tagMessage(raw, 0); + if (msgB < 0) { return ""; //$NON-NLS-1$ - final Charset enc = RawParseUtils.parseEncoding(raw); - return RawParseUtils.decode(enc, raw, msgB, raw.length); + } + return RawParseUtils.decode(guessEncoding(), raw, msgB, raw.length); } /** @@ -231,19 +235,28 @@ public class RevTag extends RevObject { * multiple lines. Embedded LFs are converted to spaces. 
*/ public final String getShortMessage() { - final byte[] raw = buffer; - final int msgB = RawParseUtils.tagMessage(raw, 0); - if (msgB < 0) + byte[] raw = buffer; + int msgB = RawParseUtils.tagMessage(raw, 0); + if (msgB < 0) { return ""; //$NON-NLS-1$ + } - final Charset enc = RawParseUtils.parseEncoding(raw); - final int msgE = RawParseUtils.endOfParagraph(raw, msgB); - String str = RawParseUtils.decode(enc, raw, msgB, msgE); - if (RevCommit.hasLF(raw, msgB, msgE)) + int msgE = RawParseUtils.endOfParagraph(raw, msgB); + String str = RawParseUtils.decode(guessEncoding(), raw, msgB, msgE); + if (RevCommit.hasLF(raw, msgB, msgE)) { str = StringUtils.replaceLineBreaksWithSpace(str); + } return str; } + private Charset guessEncoding() { + try { + return RawParseUtils.parseEncoding(buffer); + } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { + return UTF_8; + } + } + /** * Get a reference to the object this tag was placed on. *

diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java b/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java index a20e0b060..f2955f7e6 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/util/RawParseUtils.java @@ -44,6 +44,8 @@ package org.eclipse.jgit.util; +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.eclipse.jgit.lib.ObjectChecker.author; import static org.eclipse.jgit.lib.ObjectChecker.committer; import static org.eclipse.jgit.lib.ObjectChecker.encoding; @@ -60,6 +62,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Map; +import org.eclipse.jgit.annotations.Nullable; import org.eclipse.jgit.lib.Constants; import org.eclipse.jgit.lib.PersonIdent; @@ -70,7 +73,7 @@ public final class RawParseUtils { * * @since 2.2 */ - public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$ + public static final Charset UTF8_CHARSET = UTF_8; private static final byte[] digits10; @@ -81,9 +84,9 @@ public final class RawParseUtils { private static final Map encodingAliases; static { - encodingAliases = new HashMap(); - encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$ - encodingAliases.put("'utf8'", Charset.forName("UTF-8")); //$NON-NLS-1$ //$NON-NLS-2$ + encodingAliases = new HashMap<>(); + encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$ + encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$ digits10 = new byte['9' + 1]; Arrays.fill(digits10, (byte) -1); @@ -671,6 +674,27 @@ public final class RawParseUtils { return match(b, ptr, encoding); } + /** + * Parse the "encoding " header as a string. + *

+ * Locates the "encoding " header (if present) and returns its value. + * + * @param b + * buffer to scan. + * @return the encoding header as specified in the commit; null if the + * header was not present and UTF-8 should be assumed. + * @since 4.2 + */ + @Nullable + public static String parseEncodingName(final byte[] b) { + int enc = encoding(b, 0); + if (enc < 0) { + return null; + } + int lf = nextLF(b, enc); + return decode(UTF_8, b, enc, lf - 1); + } + /** * Parse the "encoding " header into a character set reference. *

@@ -678,29 +702,33 @@ public final class RawParseUtils { * {@link #encoding(byte[], int)} and then returns the proper character set * to apply to this buffer to evaluate its contents as character data. *

- * If no encoding header is present, {@link Constants#CHARSET} is assumed. + * If no encoding header is present {@code UTF-8} is assumed. * * @param b * buffer to scan. * @return the Java character set representation. Never null. + * @throws IllegalCharsetNameException + * if the character set requested by the encoding header is + * malformed and unsupportable. + * @throws UnsupportedCharsetException + * if the JRE does not support the character set requested by + * the encoding header. */ public static Charset parseEncoding(final byte[] b) { - final int enc = encoding(b, 0); - if (enc < 0) - return Constants.CHARSET; - final int lf = nextLF(b, enc); - String decoded = decode(Constants.CHARSET, b, enc, lf - 1); + String enc = parseEncodingName(b); + if (enc == null) { + return UTF_8; + } + + String name = enc.trim(); try { - return Charset.forName(decoded); - } catch (IllegalCharsetNameException badName) { - Charset aliased = charsetForAlias(decoded); - if (aliased != null) - return aliased; - throw badName; - } catch (UnsupportedCharsetException badName) { - Charset aliased = charsetForAlias(decoded); - if (aliased != null) + return Charset.forName(name); + } catch (IllegalCharsetNameException + | UnsupportedCharsetException badName) { + Charset aliased = charsetForAlias(name); + if (aliased != null) { return aliased; + } throw badName; } } @@ -739,7 +767,15 @@ public final class RawParseUtils { * parsed. */ public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) { - final Charset cs = parseEncoding(raw); + Charset cs; + try { + cs = parseEncoding(raw); + } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { + // Assume UTF-8 for person identities, usually this is correct. + // If not decode() will fall back to the ISO-8859-1 encoding. 
+ cs = UTF_8; + } + final int emailB = nextLF(raw, nameB, '<'); final int emailE = nextLF(raw, emailB, '>'); if (emailB >= raw.length || raw[emailB] == '\n' || @@ -887,7 +923,7 @@ public final class RawParseUtils { */ public static String decode(final byte[] buffer, final int start, final int end) { - return decode(Constants.CHARSET, buffer, start, end); + return decode(UTF_8, buffer, start, end); } /** @@ -961,23 +997,21 @@ public final class RawParseUtils { public static String decodeNoFallback(final Charset cs, final byte[] buffer, final int start, final int end) throws CharacterCodingException { - final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); + ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); b.mark(); // Try our built-in favorite. The assumption here is that // decoding will fail if the data is not actually encoded // using that encoder. - // try { - return decode(b, Constants.CHARSET); + return decode(b, UTF_8); } catch (CharacterCodingException e) { b.reset(); } - if (!cs.equals(Constants.CHARSET)) { + if (!cs.equals(UTF_8)) { // Try the suggested encoding, it might be right since it was // provided by the caller. - // try { return decode(b, cs); } catch (CharacterCodingException e) { @@ -987,9 +1021,8 @@ public final class RawParseUtils { // Try the default character set. A small group of people // might actually use the same (or very similar) locale. - // - final Charset defcs = Charset.defaultCharset(); - if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) { + Charset defcs = Charset.defaultCharset(); + if (!defcs.equals(cs) && !defcs.equals(UTF_8)) { try { return decode(b, defcs); } catch (CharacterCodingException e) {