Browse Source

fix utf decode

pull/3132/head
gongxuanzhang 2 years ago
parent
commit
5dc6196e50
  1. 57
      easyexcel-core/src/main/java/com/alibaba/excel/analysis/v07/handlers/sax/SharedStringsTableHandler.java
  2. 41
      easyexcel-test/src/test/java/com/alibaba/easyexcel/test/demo/rare/ReadTest.java
  3. 43
      easyexcel-test/src/test/java/com/alibaba/easyexcel/test/util/TestFileUtil.java
  4. BIN
      easyexcel-test/src/test/resources/temp/utfdecode/demo.xlsx

57
easyexcel-core/src/main/java/com/alibaba/excel/analysis/v07/handlers/sax/SharedStringsTableHandler.java

@ -1,10 +1,12 @@
package com.alibaba.excel.analysis.v07.handlers.sax;
import com.alibaba.excel.cache.ReadCache;
import com.alibaba.excel.constant.ExcelXmlConstants;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import com.alibaba.excel.cache.ReadCache;
import com.alibaba.excel.constant.ExcelXmlConstants;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Sax read sharedStringsTable.xml
@ -13,6 +15,8 @@ import com.alibaba.excel.constant.ExcelXmlConstants;
*/
public class SharedStringsTableHandler extends DefaultHandler {
private static final Pattern UTF_PATTTERN = Pattern.compile("_x([0-9A-Fa-f]{4})_");
/**
* The final piece of data
*/
@ -86,7 +90,7 @@ public class SharedStringsTableHandler extends DefaultHandler {
if (currentData == null) {
readCache.put(null);
} else {
readCache.put(currentData.toString());
readCache.put(utfDecode(currentData.toString()));
}
break;
case ExcelXmlConstants.SHAREDSTRINGS_RPH_TAG:
@ -109,4 +113,51 @@ public class SharedStringsTableHandler extends DefaultHandler {
}
currentElementData.append(ch, start, length);
}
/**
 * Decodes the OOXML {@code _xHHHH_} escape sequences contained in a string.
 * <p>
 * Characters that cannot be represented in an XML 1.0 document are stored in the escaped
 * form {@code _xHHHH_}, where each {@code H} is a hexadecimal digit of the character's value.
 * For example, the character 0x0D is invalid in an XML 1.0 document and is therefore written
 * as {@code _x000D_}. See section 3.18.9 in the OOXML spec.
 * <p>
 * Taken from POI's {@code XSSFRichTextString}.
 *
 * @param value the string to decode, may be {@code null}
 * @return the decoded string; {@code value} itself (possibly {@code null}) when it contains
 *         no escape sequence
 * @see org.apache.poi.xssf.usermodel.XSSFRichTextString#utfDecode(String)
 */
static String utfDecode(String value) {
    // Cheap pre-check: without "_x" there cannot be any escape sequence.
    if (value == null || !value.contains("_x")) {
        return value;
    }
    Matcher matcher = UTF_PATTTERN.matcher(value);
    if (!matcher.find()) {
        // "_x" occurred, but never as part of a complete _xHHHH_ sequence.
        return value;
    }
    StringBuilder decoded = new StringBuilder();
    int copiedUpTo = 0;
    do {
        // Literal text between the previous escape (or the start) and this escape.
        decoded.append(value, copiedUpTo, matcher.start());
        // Group 1 is exactly four hex digits, so the parsed value always fits into a char.
        decoded.append((char) Integer.parseInt(matcher.group(1), 16));
        copiedUpTo = matcher.end();
    } while (matcher.find());
    // Remaining literal text after the last escape.
    decoded.append(value, copiedUpTo, value.length());
    return decoded.toString();
}
}

41
easyexcel-test/src/test/java/com/alibaba/easyexcel/test/demo/rare/ReadTest.java

@ -0,0 +1,41 @@
package com.alibaba.easyexcel.test.demo.rare;
import com.alibaba.easyexcel.test.util.TestFileUtil;
import com.alibaba.excel.EasyExcel;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.Assert;
import org.junit.Test;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
/**
 * Records some less common read cases.
 *
 * @author gxz gongxuanzhang@foxmail.com
 **/
public class ReadTest {

    /**
     * When the Excel file contains content that has to be unescaped (for example special
     * symbols such as x005), the value must be decoded via utf decode. The value read by
     * EasyExcel is compared with the value Apache POI returns for the same cell.
     *
     * @throws Exception if the demo workbook cannot be opened or read
     **/
    @Test
    public void readX005() throws Exception {
        String fileName = TestFileUtil.pathBuild().sub("temp").sub("utfdecode").sub("demo.xlsx").getPath();

        // Reference value from POI. The workbook is read from a stream and closed before
        // EasyExcel opens the same file, so no handle is leaked and the fixture is only read.
        String poiValue;
        try (InputStream in = new FileInputStream(fileName);
             XSSFWorkbook xssfWorkbook = new XSSFWorkbook(in)) {
            XSSFSheet xssfSheet = xssfWorkbook.getSheetAt(0);
            XSSFRow row = xssfSheet.getRow(0);
            poiValue = row.getCell(0).getStringCellValue();
        }

        // headRowNumber(0): there is no header row, so row 0 is returned as data.
        List<Map<Integer, Object>> list = EasyExcel.read(fileName)
            .sheet(0)
            .headRowNumber(0)
            .doReadSync();
        Map<Integer, Object> easyExcelRow = list.get(0);

        // JUnit expects (expected, actual); POI's value is the reference.
        Assert.assertEquals(poiValue, easyExcelRow.get(0).toString());
    }
}

43
easyexcel-test/src/test/java/com/alibaba/easyexcel/test/util/TestFileUtil.java

@ -1,7 +1,11 @@
package com.alibaba.easyexcel.test.util;
import org.springframework.util.CollectionUtils;
import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
public class TestFileUtil {
@ -14,6 +18,10 @@ public class TestFileUtil {
return TestFileUtil.class.getResource("/").getPath();
}
/**
 * Creates a new, empty builder for the path of a test file.
 *
 * @return a fresh {@link TestPathBuild} with no path segments added yet
 */
public static TestPathBuild pathBuild() {
    TestPathBuild builder = new TestPathBuild();
    return builder;
}
public static File createNewFile(String pathName) {
File file = new File(getPath() + pathName);
if (file.exists()) {
@ -33,4 +41,39 @@ public class TestFileUtil {
/**
 * Returns the file with the given path below the current user's home directory.
 *
 * @param pathName path appended to the {@code user.home} system property
 * @return the {@link File} for {@code user.home + File.separator + pathName}
 */
public static File readUserHomeFile(String pathName) {
    String userHome = System.getProperty("user.home");
    String fullPath = userHome + File.separator + pathName;
    return new File(fullPath);
}
/**
 * Fluent builder for the path of a test file.
 * <p>
 * The path starts at {@code TestFileUtil.class.getResource("/").getPath()} and is extended by
 * the segments added through {@link #sub(String)}.
 **/
public static class TestPathBuild {

    /** Path segments in the order in which they were added. */
    private final List<String> subPath = new ArrayList<>();

    private TestPathBuild() {
    }

    /**
     * Adds one more directory or file name to the path.
     *
     * @param dirOrFile the directory or file name to append
     * @return this builder, to allow chaining
     */
    public TestPathBuild sub(String dirOrFile) {
        subPath.add(dirOrFile);
        return this;
    }

    /**
     * Builds the path from the root and all added segments.
     * <p>
     * The first segment is appended to the root directly; every further segment is preceded
     * by {@link File#separator}. Without any segment the root itself is returned.
     *
     * @return the assembled path
     */
    public String getPath() {
        StringBuilder path = new StringBuilder(TestFileUtil.class.getResource("/").getPath());
        // Empty before the first segment, File.separator before every later one.
        String separator = "";
        for (String segment : subPath) {
            path.append(separator).append(segment);
            separator = File.separator;
        }
        return path.toString();
    }
}
}

BIN
easyexcel-test/src/test/resources/temp/utfdecode/demo.xlsx

Binary file not shown.
Loading…
Cancel
Save