Browse Source

添加CSV的BOM解析支持

pull/3019/head
supalle 2 years ago
parent
commit
6dcf356dba
  1. 91
      easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/BomBufferedInputStream.java
  2. 112
      easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/ByteOrderMark.java
  3. 34
      easyexcel-core/src/main/java/com/alibaba/excel/read/metadata/holder/csv/CsvReadWorkbookHolder.java
  4. 16
      easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomData.java
  5. 53
      easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomDataTest.java
  6. 11
      easyexcel-test/src/test/resources/bom/bom_none.csv
  7. BIN
      easyexcel-test/src/test/resources/bom/bom_utf16be.csv
  8. BIN
      easyexcel-test/src/test/resources/bom/bom_utf16le.csv
  9. 11
      easyexcel-test/src/test/resources/bom/bom_utf8.csv

91
easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/BomBufferedInputStream.java

@ -0,0 +1,91 @@
package com.alibaba.excel.analysis.csv;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * A {@link BufferedInputStream} that can detect, and then skip past, an encoded
 * {@link ByteOrderMark} at the very beginning of the wrapped stream.
 * <p>
 * Detection is lazy: nothing is read from the underlying stream until
 * {@link #getByteOrderMark()} (or {@link #hasByteOrderMark()}) is called for the first time.
 * That first call must happen before any other read on this stream, because it inspects
 * whatever bytes come next and uses {@link #mark(int)} / {@link #reset()}, replacing any
 * mark the caller may have set.
 *
 * @author supalle
 * @see <a href="http://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a>
 * @see <a href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/input/BOMInputStream.html">Apache CommonsIO BOMInputStream</a>
 */
public class BomBufferedInputStream extends BufferedInputStream {

    /**
     * Candidates used when a constructor is called without explicit byte order marks.
     * Instances take a private copy of this list and never modify it.
     */
    public final static List<ByteOrderMark> DEFAULT_BYTE_ORDER_MARKS = new ArrayList<>();

    static {
        DEFAULT_BYTE_ORDER_MARKS.add(ByteOrderMark.UTF_8);
        DEFAULT_BYTE_ORDER_MARKS.add(ByteOrderMark.UTF_16BE);
        DEFAULT_BYTE_ORDER_MARKS.add(ByteOrderMark.UTF_16LE);
        DEFAULT_BYTE_ORDER_MARKS.add(ByteOrderMark.UTF_32BE);
        DEFAULT_BYTE_ORDER_MARKS.add(ByteOrderMark.UTF_32LE);
    }

    /** Whether detection has already run; afterwards the cached result is returned. */
    private boolean initialized;

    /** The detected byte order mark, or {@code null} when none matched. */
    private ByteOrderMark byteOrderMark;

    /** Private copy of the candidates, ordered longest first. */
    private final List<ByteOrderMark> byteOrderMarks;

    /**
     * Wraps {@code in} using the default buffer size.
     *
     * @param in             the stream to wrap
     * @param byteOrderMarks the marks to look for; {@code null} or empty selects
     *                       {@link #DEFAULT_BYTE_ORDER_MARKS}
     */
    public BomBufferedInputStream(InputStream in, final ByteOrderMark... byteOrderMarks) {
        super(in);
        this.byteOrderMarks = applyByteOrderMarks(byteOrderMarks);
    }

    /**
     * Wraps {@code in} using a buffer of {@code size} bytes.
     *
     * @param in             the stream to wrap
     * @param size           the buffer size passed to {@link BufferedInputStream}
     * @param byteOrderMarks the marks to look for; {@code null} or empty selects
     *                       {@link #DEFAULT_BYTE_ORDER_MARKS}
     */
    public BomBufferedInputStream(InputStream in, int size, final ByteOrderMark... byteOrderMarks) {
        super(in, size);
        this.byteOrderMarks = applyByteOrderMarks(byteOrderMarks);
    }

    /**
     * Builds the candidate list for one instance.
     * <p>
     * The result is always a fresh list, so sorting it never touches the shared
     * {@link #DEFAULT_BYTE_ORDER_MARKS} or the caller's array. {@code null} entries are dropped.
     * Candidates are ordered longest first so that a longer mark sharing a prefix with a
     * shorter one (for example UTF-32LE {@code FF FE 00 00} and UTF-16LE {@code FF FE}) is
     * tried first.
     */
    private static List<ByteOrderMark> applyByteOrderMarks(ByteOrderMark[] byteOrderMarks) {
        List<ByteOrderMark> source;
        if (byteOrderMarks == null || byteOrderMarks.length == 0) {
            source = DEFAULT_BYTE_ORDER_MARKS;
        } else {
            source = Arrays.asList(byteOrderMarks);
        }
        List<ByteOrderMark> candidates = new ArrayList<>(source.size());
        for (ByteOrderMark candidate : source) {
            if (candidate != null) {
                candidates.add(candidate);
            }
        }
        // List.sort is stable, so marks of equal length keep the order they were given in.
        candidates.sort((left, right) -> Integer.compare(right.length(), left.length()));
        return candidates;
    }

    /**
     * Tells whether the stream starts with one of the configured byte order marks.
     *
     * @return {@code true} if a byte order mark was detected
     * @throws IOException if reading from the underlying stream fails
     */
    public boolean hasByteOrderMark() throws IOException {
        return getByteOrderMark() != null;
    }

    /**
     * Detects the byte order mark at the current position and, if one is found, consumes it
     * so that subsequent reads start right after it. The result is cached; later calls do not
     * read from the stream again.
     *
     * @return the detected byte order mark, or {@code null} if none of the candidates matched
     * @throws IOException if reading from the underlying stream fails
     */
    public ByteOrderMark getByteOrderMark() throws IOException {
        if (initialized) {
            return byteOrderMark;
        }
        if (!byteOrderMarks.isEmpty()) {
            int maxBomLength = byteOrderMarks.get(0).length();
            mark(maxBomLength);
            int[] firstBytes = new int[maxBomLength];
            // -1 marks "no byte available" so a short stream can never match a longer mark.
            Arrays.fill(firstBytes, -1);
            for (int i = 0; i < maxBomLength; i++) {
                int next = read();
                if (next < 0) {
                    break;
                }
                firstBytes[i] = next;
            }
            byteOrderMark = matchByteOrderMark(this.byteOrderMarks, firstBytes);
            // Rewind to where probing started, then drop only the bytes of the matched mark.
            reset();
            if (byteOrderMark != null) {
                skipFully(byteOrderMark.length());
            }
        }
        initialized = true;
        return byteOrderMark;
    }

    /**
     * Skips exactly {@code count} bytes unless the stream ends first. {@link #skip(long)} is
     * allowed to skip fewer bytes than requested, so its result is checked and the call repeated.
     */
    private void skipFully(final long count) throws IOException {
        long remaining = count;
        while (remaining > 0) {
            long skipped = skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (read() >= 0) {
                remaining--;
            } else {
                return;
            }
        }
    }

    /**
     * Returns the first candidate whose bytes are a prefix of {@code firstBytes}, or
     * {@code null} when no candidate matches.
     */
    private ByteOrderMark matchByteOrderMark(final List<ByteOrderMark> byteOrderMarks, final int[] firstBytes) {
        for (ByteOrderMark candidate : byteOrderMarks) {
            if (startsWith(firstBytes, candidate.getBytes())) {
                return candidate;
            }
        }
        return null;
    }

    /** Tells whether {@code data} begins with all values of {@code prefix}. */
    private static boolean startsWith(final int[] data, final int[] prefix) {
        if (prefix.length > data.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if (data[i] != prefix[i]) {
                return false;
            }
        }
        return true;
    }
}

112
easyexcel-core/src/main/java/com/alibaba/excel/analysis/csv/ByteOrderMark.java

@ -0,0 +1,112 @@
package com.alibaba.excel.analysis.csv;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Objects;
import java.util.stream.Collectors;
/**
 * Byte Order Mark (BOM): the byte sequence a charset may place at the start of a stream,
 * paired with the charset it identifies.
 * <p>
 * Used in {@link BomBufferedInputStream}.
 * <p>
 * Instances are immutable: the byte values are copied on the way in and on the way out.
 *
 * @author supalle
 * @see <a href="http://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a>
 * @see <a href="https://commons.apache.org/proper/commons-io/apidocs/org/apache/commons/io/ByteOrderMark.html">Apache CommonsIO ByteOrderMark</a>
 */
public class ByteOrderMark implements Comparable<ByteOrderMark> {

    /**
     * UTF-8 BOM.
     */
    public static final ByteOrderMark UTF_8 = new ByteOrderMark(StandardCharsets.UTF_8, 0xEF, 0xBB, 0xBF);

    /**
     * UTF-16BE BOM (Big-Endian).
     */
    public static final ByteOrderMark UTF_16BE = new ByteOrderMark(StandardCharsets.UTF_16BE, 0xFE, 0xFF);

    /**
     * UTF-16LE BOM (Little-Endian).
     */
    public static final ByteOrderMark UTF_16LE = new ByteOrderMark(StandardCharsets.UTF_16LE, 0xFF, 0xFE);

    /**
     * UTF-32BE BOM (Big-Endian).
     */
    public static final ByteOrderMark UTF_32BE = new ByteOrderMark(Charset.forName("UTF-32BE"), 0x00, 0x00, 0xFE, 0xFF);

    /**
     * UTF-32LE BOM (Little-Endian).
     */
    public static final ByteOrderMark UTF_32LE = new ByteOrderMark(Charset.forName("UTF-32LE"), 0xFF, 0xFE, 0x00, 0x00);

    /**
     * Unicode BOM character; external form depends on the encoding.
     *
     * @see <a href="http://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a>
     */
    public static final char UTF_BOM = '\uFEFF';

    /** Charset announced by this byte order mark. */
    private final Charset charset;

    /** The mark's byte values, one per element; never exposed without copying. */
    private final int[] bytes;

    /**
     * Creates a byte order mark.
     *
     * @param charset the charset the mark identifies
     * @param bytes   the byte values of the mark, in stream order
     * @throws NullPointerException     if {@code charset} is {@code null}
     * @throws IllegalArgumentException if {@code bytes} is {@code null} or empty
     */
    public ByteOrderMark(final Charset charset, final int... bytes) {
        this.charset = Objects.requireNonNull(charset, "charset must be not null");
        if (bytes == null || bytes.length == 0) {
            throw new IllegalArgumentException("bytes must be not empty");
        }
        // Copy so later changes to the caller's array cannot alter this mark.
        this.bytes = bytes.clone();
    }

    /**
     * @return the charset this mark identifies
     */
    public Charset getCharset() {
        return charset;
    }

    /**
     * @return a copy of the mark's byte values; modifying it does not affect this instance
     */
    public int[] getBytes() {
        return bytes.clone();
    }

    /**
     * @return the number of bytes in the mark
     */
    public int length() {
        return bytes.length;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        ByteOrderMark that = (ByteOrderMark) o;
        return Objects.equals(charset, that.charset) && Arrays.equals(bytes, that.bytes);
    }

    @Override
    public int hashCode() {
        int result = Objects.hash(charset);
        result = 31 * result + Arrays.hashCode(bytes);
        return result;
    }

    @Override
    public String toString() {
        return "ByteOrderMark{" +
            "charset=" + charset +
            ", bytes=["
            + Arrays.stream(bytes)
            .mapToObj(Integer::toHexString)
            .map(String::toUpperCase)
            .map("0x"::concat)
            .collect(Collectors.joining(",")) +
            "]}";
    }

    /**
     * Orders marks by length, longest first, so that sorting puts the most specific mark at
     * the front. Only the length is compared, so this ordering is not consistent with
     * {@link #equals(Object)}: two different marks of the same length compare as 0.
     */
    @Override
    public int compareTo(ByteOrderMark o) {
        return Integer.compare(o.length(), length());
    }
}

34
easyexcel-core/src/main/java/com/alibaba/excel/read/metadata/holder/csv/CsvReadWorkbookHolder.java

@ -1,15 +1,18 @@
package com.alibaba.excel.read.metadata.holder.csv; package com.alibaba.excel.read.metadata.holder.csv;
import com.alibaba.excel.analysis.csv.BomBufferedInputStream;
import com.alibaba.excel.read.metadata.ReadWorkbook; import com.alibaba.excel.read.metadata.ReadWorkbook;
import com.alibaba.excel.read.metadata.holder.ReadWorkbookHolder; import com.alibaba.excel.read.metadata.holder.ReadWorkbookHolder;
import com.alibaba.excel.support.ExcelTypeEnum; import com.alibaba.excel.support.ExcelTypeEnum;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
import lombok.Getter; import lombok.Getter;
import lombok.Setter; import lombok.Setter;
import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVParser;
import java.io.IOException;
import java.nio.file.Files;
/** /**
* Workbook holder * Workbook holder
* *
@ -27,5 +30,34 @@ public class CsvReadWorkbookHolder extends ReadWorkbookHolder {
super(readWorkbook); super(readWorkbook);
setExcelType(ExcelTypeEnum.CSV); setExcelType(ExcelTypeEnum.CSV);
this.csvFormat = CSVFormat.DEFAULT; this.csvFormat = CSVFormat.DEFAULT;
// CSV BOM
if (readWorkbook.getCharset() == null) {
BomBufferedInputStream bomBufferedInputStream = buildBomBufferedInputStream();
setInputStream(bomBufferedInputStream);
setMandatoryUseInputStream(Boolean.TRUE);
try {
if (bomBufferedInputStream.hasByteOrderMark()) {
setCharset(bomBufferedInputStream.getByteOrderMark().getCharset());
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
private BomBufferedInputStream buildBomBufferedInputStream() {
BomBufferedInputStream bomBufferedInputStream;
try {
if (Boolean.TRUE.equals(getMandatoryUseInputStream())) {
bomBufferedInputStream = new BomBufferedInputStream(getInputStream());
} else if (getFile() != null) {
bomBufferedInputStream = new BomBufferedInputStream(Files.newInputStream(getFile().toPath()));
} else {
bomBufferedInputStream = new BomBufferedInputStream(getInputStream());
}
} catch (IOException e) {
throw new RuntimeException(e.getMessage());
}
return bomBufferedInputStream;
} }
} }

16
easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomData.java

@ -0,0 +1,16 @@
package com.alibaba.easyexcel.test.core.bom;
import com.alibaba.excel.annotation.ExcelProperty;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
/**
 * Row model for the BOM test fixtures: one record per CSV line, bound to columns through
 * the {@link ExcelProperty} values below.
 * Accessors and equals/hashCode are generated by the Lombok annotations.
 */
@Getter
@Setter
@EqualsAndHashCode
public class BomData {
// Bound to the column whose header is the annotation value (the "name" column).
@ExcelProperty("姓名")
private String name;
// Bound to the column whose header is the annotation value (the "age" column).
@ExcelProperty("年纪")
private Integer age;
}

53
easyexcel-test/src/test/java/com/alibaba/easyexcel/test/core/bom/BomDataTest.java

@ -0,0 +1,53 @@
package com.alibaba.easyexcel.test.core.bom;
import com.alibaba.easyexcel.test.util.TestFileUtil;
import com.alibaba.excel.EasyExcel;
import com.alibaba.excel.context.AnalysisContext;
import com.alibaba.excel.metadata.data.ReadCellData;
import com.alibaba.excel.read.listener.ReadListener;
import org.apache.commons.compress.utils.Lists;
import org.junit.Assert;
import org.junit.FixMethodOrder;
import org.junit.Test;
import org.junit.runners.MethodSorters;
import java.io.File;
import java.util.List;
import java.util.Map;
/**
 * Reads the CSV fixtures under {@code bom/} (without a byte order mark, and with UTF-8,
 * UTF-16BE and UTF-16LE marks) and checks that each one yields the same head and rows.
 */
@FixMethodOrder(MethodSorters.NAME_ASCENDING)
public class BomDataTest {

    /**
     * Reads every fixture in turn; each read performs its own assertions.
     */
    @Test
    public void t01ReadAndWriteCsv() {
        readCsv(TestFileUtil.readFile("bom" + File.separator + "bom_none.csv"));
        readCsv(TestFileUtil.readFile("bom" + File.separator + "bom_utf8.csv"));
        readCsv(TestFileUtil.readFile("bom" + File.separator + "bom_utf16be.csv"));
        readCsv(TestFileUtil.readFile("bom" + File.separator + "bom_utf16le.csv"));
    }

    /**
     * Reads {@code file} into {@link BomData} rows and asserts the first head cell, the row
     * count and the first row. The file name is used as the assertion message so a failure
     * names the fixture that caused it.
     */
    private void readCsv(File file) {
        final String fixture = file.getName();
        EasyExcel.read(file, BomData.class, new ReadListener<BomData>() {

            private final List<BomData> dataList = Lists.newArrayList();

            @Override
            public void invokeHead(Map<Integer, ReadCellData<?>> headMap, AnalysisContext context) {
                // A byte order mark that was not stripped would end up glued to this first cell.
                String head = headMap.get(0).getStringValue();
                Assert.assertEquals(fixture, "姓名", head);
            }

            @Override
            public void invoke(BomData data, AnalysisContext context) {
                dataList.add(data);
            }

            @Override
            public void doAfterAllAnalysed(AnalysisContext context) {
                // Expected value first, so a failure message reports the numbers the right way round.
                Assert.assertEquals(fixture, 10, dataList.size());
                BomData bomData = dataList.get(0);
                Assert.assertEquals(fixture, "姓名0", bomData.getName());
                // Compared as objects so a null age fails the assertion instead of throwing on unboxing.
                Assert.assertEquals(fixture, Integer.valueOf(0), bomData.getAge());
            }
        }).sheet().doRead();
    }
}

11
easyexcel-test/src/test/resources/bom/bom_none.csv

@ -0,0 +1,11 @@
姓名,年纪
姓名0,0
姓名1,1
姓名2,2
姓名3,3
姓名4,4
姓名5,5
姓名6,6
姓名7,7
姓名8,8
姓名9,9
1 姓名 年纪
2 姓名0 0
3 姓名1 1
4 姓名2 2
5 姓名3 3
6 姓名4 4
7 姓名5 5
8 姓名6 6
9 姓名7 7
10 姓名8 8
11 姓名9 9

BIN
easyexcel-test/src/test/resources/bom/bom_utf16be.csv

Binary file not shown.
1 姓名 年纪
2 姓名0 0
3 姓名1 1
4 姓名2 2
5 姓名3 3
6 姓名4 4
7 姓名5 5
8 姓名6 6
9 姓名7 7
10 姓名8 8
11 姓名9 9

BIN
easyexcel-test/src/test/resources/bom/bom_utf16le.csv

Binary file not shown.
1 姓名 年纪
2 姓名0 0
3 姓名1 1
4 姓名2 2
5 姓名3 3
6 姓名4 4
7 姓名5 5
8 姓名6 6
9 姓名7 7
10 姓名8 8
11 姓名9 9

11
easyexcel-test/src/test/resources/bom/bom_utf8.csv

@ -0,0 +1,11 @@
姓名,年纪
姓名0,0
姓名1,1
姓名2,2
姓名3,3
姓名4,4
姓名5,5
姓名6,6
姓名7,7
姓名8,8
姓名9,9
1 姓名 年纪
2 姓名0 0
3 姓名1 1
4 姓名2 2
5 姓名3 3
6 姓名4 4
7 姓名5 5
8 姓名6 6
9 姓名7 7
10 姓名8 8
11 姓名9 9
Loading…
Cancel
Save