diff --git a/easyexcel-core/src/main/java/com/alibaba/excel/analysis/v07/handlers/sax/SharedStringsTableHandler.java b/easyexcel-core/src/main/java/com/alibaba/excel/analysis/v07/handlers/sax/SharedStringsTableHandler.java index 203db6c7..169429de 100644 --- a/easyexcel-core/src/main/java/com/alibaba/excel/analysis/v07/handlers/sax/SharedStringsTableHandler.java +++ b/easyexcel-core/src/main/java/com/alibaba/excel/analysis/v07/handlers/sax/SharedStringsTableHandler.java @@ -1,10 +1,12 @@ package com.alibaba.excel.analysis.v07.handlers.sax; +import com.alibaba.excel.cache.ReadCache; +import com.alibaba.excel.constant.ExcelXmlConstants; import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; -import com.alibaba.excel.cache.ReadCache; -import com.alibaba.excel.constant.ExcelXmlConstants; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * Sax read sharedStringsTable.xml @@ -13,6 +15,8 @@ import com.alibaba.excel.constant.ExcelXmlConstants; */ public class SharedStringsTableHandler extends DefaultHandler { + private static final Pattern UTF_PATTTERN = Pattern.compile("_x([0-9A-Fa-f]{4})_"); + /** * The final piece of data */ @@ -86,7 +90,7 @@ public class SharedStringsTableHandler extends DefaultHandler { if (currentData == null) { readCache.put(null); } else { - readCache.put(currentData.toString()); + readCache.put(utfDecode(currentData.toString())); } break; case ExcelXmlConstants.SHAREDSTRINGS_RPH_TAG: @@ -109,4 +113,51 @@ public class SharedStringsTableHandler extends DefaultHandler { } currentElementData.append(ch, start, length); } + + /** + * Decodes every _xHHHH_ escape sequence in the given string into the character with that hexadecimal code; adapted from POI XSSFRichTextString. + * + * @param value the string to decode + * @return the decoded string or null if the input string is null + *

+ * For all characters which cannot be represented in XML as defined by the XML 1.0 specification, + * the characters are escaped using the Unicode numerical character representation escape character + * format _xHHHH_, where H represents a hexadecimal character in the character's value. + *

+ * Example: The Unicode character 0D is invalid in an XML 1.0 document, + * so it shall be escaped as _x000D_. + *

+ * See section 3.18.9 in the OOXML spec. + * @see org.apache.poi.xssf.usermodel.XSSFRichTextString#utfDecode(String) + */ + static String utfDecode(String value) { + if (value == null || !value.contains("_x")) { + return value; + } + + StringBuilder buf = new StringBuilder(); + Matcher m = UTF_PATTTERN.matcher(value); + int idx = 0; + while (m.find()) { + int pos = m.start(); + if (pos > idx) { + buf.append(value, idx, pos); + } + + String code = m.group(1); + int icode = Integer.decode("0x" + code); + buf.append((char) icode); + + idx = m.end(); + } + + // small optimization: don't go via StringBuilder if not necessary, + // the encodings are very rare, so we should almost always go via this shortcut. + if (idx == 0) { + return value; + } + + buf.append(value.substring(idx)); + return buf.toString(); + } } diff --git a/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/demo/rare/ReadTest.java b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/demo/rare/ReadTest.java new file mode 100644 index 00000000..c1e614ea --- /dev/null +++ b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/demo/rare/ReadTest.java @@ -0,0 +1,41 @@ +package com.alibaba.easyexcel.test.demo.rare; + +import com.alibaba.easyexcel.test.util.TestFileUtil; +import com.alibaba.excel.EasyExcel; +import org.apache.poi.xssf.usermodel.XSSFRow; +import org.apache.poi.xssf.usermodel.XSSFSheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.Assert; +import org.junit.Test; + +import java.io.File; +import java.util.List; +import java.util.Map; + +/** + * + * Records some uncommon read cases. + * @author gxz gongxuanzhang@foxmail.com + **/ +public class ReadTest { + + + /** + * When an Excel file contains content that needs escaping, such as the x005 special symbol, it must be decoded via utf decode; checks that the EasyExcel value equals the value POI reads. + * + **/ + @Test + public void readX005() throws Exception{ + String fileName = TestFileUtil.pathBuild().sub("temp").sub("utfdecode").sub("demo.xlsx").getPath(); + XSSFWorkbook xssfWorkbook = new XSSFWorkbook(fileName); + XSSFSheet xssfSheet = xssfWorkbook.getSheetAt(0); + 
XSSFRow row = xssfSheet.getRow(0); + String poiValue = row.getCell(0).getStringCellValue(); + List> list = EasyExcel.read(fileName) + //.useDefaultListener(false) + .sheet(0) + .headRowNumber(0).doReadSync(); + Map easyExcelRow = list.get(0); + Assert.assertEquals(easyExcelRow.get(0).toString(),poiValue); + } +} diff --git a/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/util/TestFileUtil.java b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/util/TestFileUtil.java index d4665b44..b0ab7ebb 100644 --- a/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/util/TestFileUtil.java +++ b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/util/TestFileUtil.java @@ -1,7 +1,11 @@ package com.alibaba.easyexcel.test.util; +import org.springframework.util.CollectionUtils; + import java.io.File; import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; public class TestFileUtil { @@ -14,6 +18,10 @@ public class TestFileUtil { return TestFileUtil.class.getResource("/").getPath(); } + public static TestPathBuild pathBuild() { + return new TestPathBuild(); + } + public static File createNewFile(String pathName) { File file = new File(getPath() + pathName); if (file.exists()) { @@ -33,4 +41,39 @@ public class TestFileUtil { public static File readUserHomeFile(String pathName) { return new File(System.getProperty("user.home") + File.separator + pathName); } + + + /** + * Builds a test file path by appending the collected sub-path segments to the path of the class's root resource. + **/ + public static class TestPathBuild { + private TestPathBuild() { + subPath = new ArrayList<>(); + } + + private final List subPath; + + public TestPathBuild sub(String dirOrFile) { + subPath.add(dirOrFile); + return this; + } + + public String getPath() { + if (CollectionUtils.isEmpty(subPath)) { + return TestFileUtil.class.getResource("/").getPath(); + } + if (subPath.size() == 1) { + return TestFileUtil.class.getResource("/").getPath() + subPath.get(0); + } + StringBuilder path = new 
StringBuilder(TestFileUtil.class.getResource("/").getPath()); + path.append(subPath.get(0)); + for (int i = 1; i < subPath.size(); i++) { + path.append(File.separator).append(subPath.get(i)); + } + return path.toString(); + } + + } + + } diff --git a/easyexcel-test/src/test/resources/temp/utfdecode/demo.xlsx b/easyexcel-test/src/test/resources/temp/utfdecode/demo.xlsx new file mode 100644 index 00000000..0b29141d Binary files /dev/null and b/easyexcel-test/src/test/resources/temp/utfdecode/demo.xlsx differ