From 6dcf356dba3d79c61601648052a19dda5bcad874 Mon Sep 17 00:00:00 2001 From: supalle Date: Sat, 4 Mar 2023 13:57:03 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0CSV=E7=9A=84BOM=E8=A7=A3?= =?UTF-8?q?=E6=9E=90=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../analysis/csv/BomBufferedInputStream.java | 91 ++++++++++++++ .../excel/analysis/csv/ByteOrderMark.java | 112 ++++++++++++++++++ .../holder/csv/CsvReadWorkbookHolder.java | 34 +++++- .../easyexcel/test/core/bom/BomData.java | 16 +++ .../easyexcel/test/core/bom/BomDataTest.java | 53 +++++++++ .../src/test/resources/bom/bom_none.csv | 11 ++ .../src/test/resources/bom/bom_utf16be.csv | Bin 0 -> 152 bytes .../src/test/resources/bom/bom_utf16le.csv | Bin 0 -> 152 bytes .../src/test/resources/bom/bom_utf8.csv | 11 ++ 9 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/BomBufferedInputStream.java create mode 100644 easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/ByteOrderMark.java create mode 100644 easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomData.java create mode 100644 easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomDataTest.java create mode 100644 easyexcel-test/src/test/resources/bom/bom_none.csv create mode 100644 easyexcel-test/src/test/resources/bom/bom_utf16be.csv create mode 100644 easyexcel-test/src/test/resources/bom/bom_utf16le.csv create mode 100644 easyexcel-test/src/test/resources/bom/bom_utf8.csv diff --git a/easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/BomBufferedInputStream.java b/easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/BomBufferedInputStream.java new file mode 100644 index 00000000..9a53520d --- /dev/null +++ b/easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/BomBufferedInputStream.java @@ -0,0 +1,91 @@ +package com.alibaba.excel.analysis.csv; + + 
+import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes. + * + * @author supalle + * @see Byte Order Mark (BOM) FAQ + * @see Apache CommonsIO BOMInputStream + */ +public class BomBufferedInputStream extends BufferedInputStream { + public final static List DEFAULT_BYTE_ORDER_MARKS = new ArrayList<>(); + + static { + DEFAULT_BYTE_ORDER_MARKS.add(ByteOrderMark.UTF_8); + DEFAULT_BYTE_ORDER_MARKS.add(ByteOrderMark.UTF_16BE); + DEFAULT_BYTE_ORDER_MARKS.add(ByteOrderMark.UTF_16LE); + DEFAULT_BYTE_ORDER_MARKS.add(ByteOrderMark.UTF_32BE); + DEFAULT_BYTE_ORDER_MARKS.add(ByteOrderMark.UTF_32LE); + } + + private boolean initialized; + private ByteOrderMark byteOrderMark; + private final List byteOrderMarks; + + public BomBufferedInputStream(InputStream in, final ByteOrderMark... byteOrderMarks) { + super(in); + this.byteOrderMarks = applyByteOrderMarks(byteOrderMarks); + } + + public BomBufferedInputStream(InputStream in, int size, final ByteOrderMark... byteOrderMarks) { + super(in, size); + this.byteOrderMarks = applyByteOrderMarks(byteOrderMarks); + } + + private static List applyByteOrderMarks(ByteOrderMark[] byteOrderMarks) { + return byteOrderMarks == null || byteOrderMarks.length == 0 ? 
DEFAULT_BYTE_ORDER_MARKS : Arrays.asList(byteOrderMarks); + } + + public boolean hasByteOrderMark() throws IOException { + return getByteOrderMark() != null; + } + + public ByteOrderMark getByteOrderMark() throws IOException { + if (initialized) { + return byteOrderMark; + } + this.byteOrderMarks.sort(ByteOrderMark::compareTo); + int maxBomLength = byteOrderMarks.get(0).length(); + mark(maxBomLength); + int[] firstBytes = new int[maxBomLength]; + for (int i = 0; i < maxBomLength; i++) { + firstBytes[i] = read(); + if (firstBytes[i] < 0) { + break; + } + } + byteOrderMark = matchByteOrderMark(this.byteOrderMarks, firstBytes); + + reset(); + if (byteOrderMark != null) { + // read(new byte[byteOrderMark.length()]); + skip(byteOrderMark.length()); + } + initialized = true; + return byteOrderMark; + } + + private ByteOrderMark matchByteOrderMark(final List byteOrderMarks, final int[] firstBytes) { + loop: + for (ByteOrderMark item : byteOrderMarks) { + int[] bytes = item.getBytes(); + int length = bytes.length; + for (int i = 0; i < length; i++) { + if (firstBytes[i] != bytes[i]) { + continue loop; + } + } + return item; + } + return null; + } + +} diff --git a/easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/ByteOrderMark.java b/easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/ByteOrderMark.java new file mode 100644 index 00000000..0c68351b --- /dev/null +++ b/easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/ByteOrderMark.java @@ -0,0 +1,112 @@ +package com.alibaba.excel.analysis.csv; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * Byte Order Mark (BOM) + *
+ * Used in {@link BomBufferedInputStream} + * + * @author supalle + * @see Byte Order Mark (BOM) FAQ + * @see Apache CommonsIO ByteOrderMark + */ +public class ByteOrderMark implements Comparable { + + /** + * UTF-8 BOM. + */ + public static final ByteOrderMark UTF_8 = new ByteOrderMark(StandardCharsets.UTF_8, 0xEF, 0xBB, 0xBF); + + /** + * UTF-16BE BOM (Big-Endian). + */ + public static final ByteOrderMark UTF_16BE = new ByteOrderMark(StandardCharsets.UTF_16BE, 0xFE, 0xFF); + + /** + * UTF-16LE BOM (Little-Endian). + */ + public static final ByteOrderMark UTF_16LE = new ByteOrderMark(StandardCharsets.UTF_16LE, 0xFF, 0xFE); + + /** + * UTF-32BE BOM (Big-Endian). + * + * @since 2.2 + */ + public static final ByteOrderMark UTF_32BE = new ByteOrderMark(Charset.forName("UTF-32BE"), 0x00, 0x00, 0xFE, 0xFF); + + /** + * UTF-32LE BOM (Little-Endian). + * + * @since 2.2 + */ + public static final ByteOrderMark UTF_32LE = new ByteOrderMark(Charset.forName("UTF-32LE"), 0xFF, 0xFE, 0x00, 0x00); + + /** + * Unicode BOM character; external form depends on the encoding. + * + * @see Byte Order Mark (BOM) FAQ + * @since 2.5 + */ + public static final char UTF_BOM = '\uFEFF'; + + private final Charset charset; + private final int[] bytes; + + public ByteOrderMark(final Charset charset, final int... 
bytes) { + this.charset = Objects.requireNonNull(charset, "charset must be not null"); + if (bytes == null || bytes.length == 0) { + throw new IllegalArgumentException("bytes must be not empty"); + } + this.bytes = bytes; + } + + public Charset getCharset() { + return charset; + } + + public int[] getBytes() { + return bytes; + } + + public int length() { + return bytes.length; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + ByteOrderMark that = (ByteOrderMark) o; + return Objects.equals(charset, that.charset) && Arrays.equals(bytes, that.bytes); + } + + @Override + public int hashCode() { + int result = Objects.hash(charset); + result = 31 * result + Arrays.hashCode(bytes); + return result; + } + + @Override + public String toString() { + return "ByteOrderMark{" + + "charset=" + charset + + ", bytes=[" + + Arrays.stream(bytes) + .mapToObj(Integer::toHexString) + .map(String::toUpperCase) + .map("0x"::concat) + .collect(Collectors.joining(",")) + + "]}"; + } + + @Override + public int compareTo(ByteOrderMark o) { + return o.length() - length(); + } +} diff --git a/easyexcel-core/src/main/java/com/alibaba/excel/read/metadata/holder/csv/CsvReadWorkbookHolder.java b/easyexcel-core/src/main/java/com/alibaba/excel/read/metadata/holder/csv/CsvReadWorkbookHolder.java index 90ebe2b5..04aeac4e 100644 --- a/easyexcel-core/src/main/java/com/alibaba/excel/read/metadata/holder/csv/CsvReadWorkbookHolder.java +++ b/easyexcel-core/src/main/java/com/alibaba/excel/read/metadata/holder/csv/CsvReadWorkbookHolder.java @@ -1,15 +1,18 @@ package com.alibaba.excel.read.metadata.holder.csv; +import com.alibaba.excel.analysis.csv.BomBufferedInputStream; import com.alibaba.excel.read.metadata.ReadWorkbook; import com.alibaba.excel.read.metadata.holder.ReadWorkbookHolder; import com.alibaba.excel.support.ExcelTypeEnum; - import lombok.EqualsAndHashCode; import lombok.Getter; import 
lombok.Setter; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; +import java.io.IOException; +import java.nio.file.Files; + /** * Workbook holder * @@ -27,5 +30,34 @@ public class CsvReadWorkbookHolder extends ReadWorkbookHolder { super(readWorkbook); setExcelType(ExcelTypeEnum.CSV); this.csvFormat = CSVFormat.DEFAULT; + // CSV BOM + if (readWorkbook.getCharset() == null) { + BomBufferedInputStream bomBufferedInputStream = buildBomBufferedInputStream(); + setInputStream(bomBufferedInputStream); + setMandatoryUseInputStream(Boolean.TRUE); + try { + if (bomBufferedInputStream.hasByteOrderMark()) { + setCharset(bomBufferedInputStream.getByteOrderMark().getCharset()); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + private BomBufferedInputStream buildBomBufferedInputStream() { + BomBufferedInputStream bomBufferedInputStream; + try { + if (Boolean.TRUE.equals(getMandatoryUseInputStream())) { + bomBufferedInputStream = new BomBufferedInputStream(getInputStream()); + } else if (getFile() != null) { + bomBufferedInputStream = new BomBufferedInputStream(Files.newInputStream(getFile().toPath())); + } else { + bomBufferedInputStream = new BomBufferedInputStream(getInputStream()); + } + } catch (IOException e) { + throw new RuntimeException(e.getMessage()); + } + return bomBufferedInputStream; } } diff --git a/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomData.java b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomData.java new file mode 100644 index 00000000..24d8ece8 --- /dev/null +++ b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomData.java @@ -0,0 +1,16 @@ +package com.alibaba.easyexcel.test.core.bom; + +import com.alibaba.excel.annotation.ExcelProperty; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.Setter; + +@Getter +@Setter +@EqualsAndHashCode +public class BomData { + @ExcelProperty("姓名") + private String name; + 
@ExcelProperty("年纪") + private Integer age; +} diff --git a/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomDataTest.java b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomDataTest.java new file mode 100644 index 00000000..0c4ceacc --- /dev/null +++ b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomDataTest.java @@ -0,0 +1,53 @@ +package com.alibaba.easyexcel.test.core.bom; + +import com.alibaba.easyexcel.test.util.TestFileUtil; +import com.alibaba.excel.EasyExcel; +import com.alibaba.excel.context.AnalysisContext; +import com.alibaba.excel.metadata.data.ReadCellData; +import com.alibaba.excel.read.listener.ReadListener; +import org.apache.commons.compress.utils.Lists; +import org.junit.Assert; +import org.junit.FixMethodOrder; +import org.junit.Test; +import org.junit.runners.MethodSorters; + +import java.io.File; +import java.util.List; +import java.util.Map; + +@FixMethodOrder(MethodSorters.NAME_ASCENDING) +public class BomDataTest { + @Test + public void t01ReadAndWriteCsv() { + readCsv(TestFileUtil.readFile("bom" + File.separator + "bom_none.csv")); + readCsv(TestFileUtil.readFile("bom" + File.separator + "bom_utf8.csv")); + readCsv(TestFileUtil.readFile("bom" + File.separator + "bom_utf16be.csv")); + readCsv(TestFileUtil.readFile("bom" + File.separator + "bom_utf16le.csv")); + } + + private void readCsv(File file) { + EasyExcel.read(file, BomData.class, new ReadListener() { + + private final List dataList = Lists.newArrayList(); + + @Override + public void invokeHead(Map> headMap, AnalysisContext context) { + String head = headMap.get(0).getStringValue(); + Assert.assertEquals("姓名", head); + } + + @Override + public void invoke(BomData data, AnalysisContext context) { + dataList.add(data); + } + + @Override + public void doAfterAllAnalysed(AnalysisContext context) { + Assert.assertEquals(dataList.size(), 10); + BomData bomData = dataList.get(0); + Assert.assertEquals("姓名0", bomData.getName()); + 
Assert.assertEquals(0, (long) bomData.getAge()); + } + }).sheet().doRead(); + } +} diff --git a/easyexcel-test/src/test/resources/bom/bom_none.csv b/easyexcel-test/src/test/resources/bom/bom_none.csv new file mode 100644 index 00000000..26d73e1f --- /dev/null +++ b/easyexcel-test/src/test/resources/bom/bom_none.csv @@ -0,0 +1,11 @@ +姓名,年纪 +姓名0,0 +姓名1,1 +姓名2,2 +姓名3,3 +姓名4,4 +姓名5,5 +姓名6,6 +姓名7,7 +姓名8,8 +姓名9,9 \ No newline at end of file diff --git a/easyexcel-test/src/test/resources/bom/bom_utf16be.csv b/easyexcel-test/src/test/resources/bom/bom_utf16be.csv new file mode 100644 index 0000000000000000000000000000000000000000..ad13f8716eaf7e8219509eb773a657cadfc3cec0 GIT binary patch literal 152 zcmX}fxei_@% literal 0 HcmV?d00001 diff --git a/easyexcel-test/src/test/resources/bom/bom_utf16le.csv b/easyexcel-test/src/test/resources/bom/bom_utf16le.csv new file mode 100644 index 0000000000000000000000000000000000000000..8e8eba909c4cbf4eafa298a5014930dd2c7a051f GIT binary patch literal 152 zcmX}fsS$uM00h9(%%qcu{7)a47^*<0ptM6jz#+JG``p(syTy5|X|Um&gN4IC&v`9y YL5p0}5|^~hWvy^Ut6bF@*R;-cyk{mG>i_@% literal 0 HcmV?d00001 diff --git a/easyexcel-test/src/test/resources/bom/bom_utf8.csv b/easyexcel-test/src/test/resources/bom/bom_utf8.csv new file mode 100644 index 00000000..358d5bac --- /dev/null +++ b/easyexcel-test/src/test/resources/bom/bom_utf8.csv @@ -0,0 +1,11 @@ +姓名,年纪 +姓名0,0 +姓名1,1 +姓名2,2 +姓名3,3 +姓名4,4 +姓名5,5 +姓名6,6 +姓名7,7 +姓名8,8 +姓名9,9 \ No newline at end of file