From 5dc6196e509db434d30beaaebbd5757ace25e055 Mon Sep 17 00:00:00 2001 From: gongxuanzhang Date: Fri, 21 Apr 2023 11:47:55 +0800 Subject: [PATCH] fix utf decode --- .../sax/SharedStringsTableHandler.java | 57 +++++++++++++++++- .../easyexcel/test/demo/rare/ReadTest.java | 41 +++++++++++++ .../easyexcel/test/util/TestFileUtil.java | 43 +++++++++++++ .../test/resources/temp/utfdecode/demo.xlsx | Bin 0 -> 8762 bytes 4 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 easyexcel-test/src/test/java/com/alibaba/easyexcel/test/demo/rare/ReadTest.java create mode 100644 easyexcel-test/src/test/resources/temp/utfdecode/demo.xlsx diff --git a/easyexcel-core/src/main/java/com/alibaba/excel/analysis/v07/handlers/sax/SharedStringsTableHandler.java b/easyexcel-core/src/main/java/com/alibaba/excel/analysis/v07/handlers/sax/SharedStringsTableHandler.java index 203db6c7..169429de 100644 --- a/easyexcel-core/src/main/java/com/alibaba/excel/analysis/v07/handlers/sax/SharedStringsTableHandler.java +++ b/easyexcel-core/src/main/java/com/alibaba/excel/analysis/v07/handlers/sax/SharedStringsTableHandler.java @@ -1,10 +1,12 @@ package com.alibaba.excel.analysis.v07.handlers.sax; +import com.alibaba.excel.cache.ReadCache; +import com.alibaba.excel.constant.ExcelXmlConstants; import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; -import com.alibaba.excel.cache.ReadCache; -import com.alibaba.excel.constant.ExcelXmlConstants; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * Sax read sharedStringsTable.xml @@ -13,6 +15,8 @@ import com.alibaba.excel.constant.ExcelXmlConstants; */ public class SharedStringsTableHandler extends DefaultHandler { + private static final Pattern UTF_PATTTERN = Pattern.compile("_x([0-9A-Fa-f]{4})_"); + /** * The final piece of data */ @@ -86,7 +90,7 @@ public class SharedStringsTableHandler extends DefaultHandler { if (currentData == null) { readCache.put(null); } else { - readCache.put(currentData.toString()); + readCache.put(utfDecode(currentData.toString())); } break; case ExcelXmlConstants.SHAREDSTRINGS_RPH_TAG: @@ -109,4 +113,51 @@ public class SharedStringsTableHandler extends DefaultHandler { } currentElementData.append(ch, start, length); } + + /** + * from poi XSSFRichTextString + * + * @param value the string to decode + * @return the decoded string or null if the input string is null + *

+ * For all characters which cannot be represented in XML as defined by the XML 1.0 specification, + * the characters are escaped using the Unicode numerical character representation escape character + * format _xHHHH_, where H represents a hexadecimal character in the character's value. + *

+ * Example: The Unicode character 0D is invalid in an XML 1.0 document, + * so it shall be escaped as _x000D_. + *

+ * See section 3.18.9 in the OOXML spec. + * @see org.apache.poi.xssf.usermodel.XSSFRichTextString#utfDecode(String) + */ + static String utfDecode(String value) { + if (value == null || !value.contains("_x")) { + return value; + } + + StringBuilder buf = new StringBuilder(); + Matcher m = UTF_PATTTERN.matcher(value); + int idx = 0; + while (m.find()) { + int pos = m.start(); + if (pos > idx) { + buf.append(value, idx, pos); + } + + String code = m.group(1); + int icode = Integer.decode("0x" + code); + buf.append((char) icode); + + idx = m.end(); + } + + // small optimization: don't go via StringBuilder if not necessary, + // the encodings are very rare, so we should almost always go via this shortcut. + if (idx == 0) { + return value; + } + + buf.append(value.substring(idx)); + return buf.toString(); + } } diff --git a/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/demo/rare/ReadTest.java b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/demo/rare/ReadTest.java new file mode 100644 index 00000000..c1e614ea --- /dev/null +++ b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/demo/rare/ReadTest.java @@ -0,0 +1,41 @@ +package com.alibaba.easyexcel.test.demo.rare; + +import com.alibaba.easyexcel.test.util.TestFileUtil; +import com.alibaba.excel.EasyExcel; +import org.apache.poi.xssf.usermodel.XSSFRow; +import org.apache.poi.xssf.usermodel.XSSFSheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.junit.Assert; +import org.junit.Test; + +import java.io.File; +import java.util.List; +import java.util.Map; + +/** + * + * 记录一些不太常见的案例 + * @author gxz gongxuanzhang@foxmail.com + **/ +public class ReadTest { + + + /** + * 当excel有需要转义的 如x005特殊符号时需要通过utf decode解码 + * + **/ + @Test + public void readX005() throws Exception{ + String fileName = TestFileUtil.pathBuild().sub("temp").sub("utfdecode").sub("demo.xlsx").getPath(); + XSSFWorkbook xssfWorkbook = new XSSFWorkbook(fileName); + XSSFSheet xssfSheet = xssfWorkbook.getSheetAt(0); + XSSFRow row = xssfSheet.getRow(0); + String poiValue = row.getCell(0).getStringCellValue(); + List> list = EasyExcel.read(fileName) + //.useDefaultListener(false) + .sheet(0) + .headRowNumber(0).doReadSync(); + Map easyExcelRow = list.get(0); + Assert.assertEquals(easyExcelRow.get(0).toString(),poiValue); + } +} diff --git a/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/util/TestFileUtil.java b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/util/TestFileUtil.java index d4665b44..b0ab7ebb 100644 --- a/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/util/TestFileUtil.java +++ b/easyexcel-test/src/test/java/com/alibaba/easyexcel/test/util/TestFileUtil.java @@ -1,7 +1,11 @@ package com.alibaba.easyexcel.test.util; +import org.springframework.util.CollectionUtils; + import java.io.File; import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; public class TestFileUtil { @@ -14,6 +18,10 @@ public class TestFileUtil { return TestFileUtil.class.getResource("/").getPath(); } + public static TestPathBuild pathBuild() { + return new TestPathBuild(); + } + public static File createNewFile(String pathName) { File file = new File(getPath() + pathName); if (file.exists()) { @@ -33,4 +41,39 @@ public class TestFileUtil { public static File readUserHomeFile(String pathName) { return new File(System.getProperty("user.home") + File.separator + pathName); } + + + /** + * build to test file path + **/ + public static class TestPathBuild { + private TestPathBuild() { + subPath = new ArrayList<>(); + } + + private final List subPath; + + public TestPathBuild sub(String dirOrFile) { + subPath.add(dirOrFile); + return this; + } + + public String getPath() { + if (CollectionUtils.isEmpty(subPath)) { + return TestFileUtil.class.getResource("/").getPath(); + } + if (subPath.size() == 1) { + return TestFileUtil.class.getResource("/").getPath() + subPath.get(0); + } + StringBuilder path = new StringBuilder(TestFileUtil.class.getResource("/").getPath()); + path.append(subPath.get(0)); + for (int i = 1; i < subPath.size(); i++) { + path.append(File.separator).append(subPath.get(i)); + } + return path.toString(); + } + + } + + } diff --git a/easyexcel-test/src/test/resources/temp/utfdecode/demo.xlsx b/easyexcel-test/src/test/resources/temp/utfdecode/demo.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..0b29141d21d2234296ed280cc89af3b5bed8c181 GIT binary patch literal 8762 zcmai41yq#X)+Pt(?vR!)>28D}hVD-3?vU;d0i`9Sq&ua%q)S4O9zbb+(0jk&e)s-o zE#`%p9dn+2_C9;<2M`b%1_9!6h01>wetiCW!9VjPA zn72N>dktU+pEq%2FaR1L(n#L%#T9<`0Nx`rzCXM;c5OXH*DxzwnU$oE&>qa6=**!H zXH4cyAyV7-{+KiRc|1io;QMH$5_R^lR(gH*0Bp!M8HrKve1_-_V6#QFlLqJjG6%}( z&yvvg$IgEHS>HES_*Pb2p!IxfoVRo&{{4>-jeWxAH=ogieY$FM2#!m1iiQ0k6hb=$ z#Jc3TXR#sj)5O|*H@J7z>n<*RE@A%;GcM=+Tcrn>tsh`U|0~SKc8;cxI7cT;$aJ${ zg&e`I=4`gfO(MsSyJlC>hX!ExG{?QS?c}7Du37YQKiaq?C^+EcI}`1f1K)q+yXSet znAa0-FKL1&7hVM2!tlf79+5!G&;^FXs1u`y4g01A913H^X|-&PEFR*|FEs|OP`I!O zuCg^e3+}6IQ5kc>-MSb=ykW_okbkaQ&Ngg)O|F0}4ANNx@V{MhT7usA@k-q51D2G@ zwb>pdDQpMd1tol`)F`8=WtCdpDBF9t8%LiuPQCf- z-p0%KNcU<`o~b6Afl;EIobryKOtV?8Eevd0X>-=JK3dw{_wh;S;7KmQ;-l&q)RDGZ zzp$DQN%=`5yFu;{SYn4$_ln-oe-D7g#d90lhX9~H1OW4I0dR40wzGK*L4Le0@OKFA z#rl2H+2v+kYl!#ygj+x$H+m|$deOur>I{^V_IIp>?(5NdBrDzBr4jA8)ScTC>h*s235TiA2+o4$Fw&;t{st zR7op@TVz3ML!I+ObFMJ0(5(j4MUshB5(BXI zNX#V)jLAIRF)-`zeQR_I@vFp80sXPleX*!-e=2;bqzEc%{pcRQ_1<5(ZNgUlBuj`> zJ&QY%MwcSm4Rm52cGKp2Uryvb*M|au&)T4Ig7Qfp0bLbNt#|n68qMUD0aXH_EBOyd zlSC3t3c=0GvPvaBK~{!tZEo*HOIWK46dWRXO2{bj$>)5_eQ5{99+>)=o5HQ5o}-lI zh5g|b4UOaGwFrV&I4`%#Qa|Xrdb~Ki+np*w%)Lh%8mtsLQx{)ScG9+C7c6u&7kN{` z-_2qo_Wb3Rp2~aVB8_JqGi15-R^bV=rVAZnXfE>>>0cW>ak9%Uz6D^WcD1u3oeZ1N z)W;eKld(I$cprVe>e>DrvmGNT_A{;FheU*PbTtE6Bh!;HO-P3l(A$b1IyQ+++hmmh zEf1Ck@OHgzWnPKO57?W-Q08nmVvMGKTZO7Vls&c!|A`;8KC%sMIhcoA?s+{`q-mB& zN)^`P+<8f_c0P>)9VjIt1dUQ_<&s*LI=S#&7bMKG^SM0g))-_75~AA_cBDq@NM0{h z&rZxhCl6O!jcOe^xLV99TPF%T_UzV|+(Z5MQ}uX#J)A0M3sW1@C)by}bu{%cECd8S z{_nK@L+qFEucLCH3A7{RLTkg_@zdCEjW^jt$B&2Bj%|p}fQe5am$H1jVES1Xfzni} zo|Fe076I#3qz(dB>lp|Oc^m?HLF^u=OZjT*`2?9;EjbmPZ$~J5*nVKj^~i7^{vp@q z_rV!B_;VCi+@O5McE{pgRiVH_E7(^LAvDOnz!XwTMFl$h-JxQ)wM%sC@2{nrbX5| ziI8%DN6?#hf?XaE(d`M_hdI+S*76+gKyFzq#iF zqHo9k3vpdBVo+1vk{f*8pidRr667U2EhA1?)61kC0 zBuECG&+&C9+=*G3_&<%={ad|y)aA+Jn`45w453D&Jm+?`*LT@@3=CbjOI4;X*)UQ4 zH;r0!=zG^&NMAa)e&UwGKIB+@mF4ygCh0))6<((`#yb>@=*w)G1VLS~Ng~R#VWuF7 zsm`xejzY{c2#DEgDzJ5uB-TY*zGVp11~8%2;toaMlaf94bajU#OV652wCJ;uj<~?s zl||0V4!V2-9L?6n8M%`<@Wzevl+I?(sF4sE?f3S6$@9G_M}NcWUtY;V&Hq{7UW9(vw6Puxb0slg{|^BO&RO+bUx<3~r9lqR)K zKMICJiajH58Udxa(V=2OjW@?_E)x3;L2jcer!d-zrlVY3vRgC})-n?tNwhubNEIOk z-PAl4jdua|X!F9&je+Sv0qD?|h{QvTcdH(q;geQgE(RV3B(po6CZ`zG5hWs-c4>Ql zZ_-ftr4pzj$C_wDItmbh zd^jLU=M-YWGtkzo^{VC%@n;Y=~JU&ZFKHFv+U ze!X2Q48XE^TSik^9{g5-VdV?5Rb1_Wkn#X{>Ml6LOIj}zsH@~`typtzknr{LF2iYT zt980}Bru0+`Q=jl?i^!dm0k)}_aTqZ)QZi3-O9u)K@~fg*R|c4r*0wvH9Fqg>2z^* z0nZ~e#ldTgg&$sHE-$6!o5wYmOunMs7@n)--e{X)Hy3mtu`55fiyJV{XLjdID#*ej zImxk6jlfk_du6GzPt0d0`a|Z+y4IA^Pr!ux5!5St#YCzs?Zw?dI=B3zJL`S-irb^D zPk#2gO+Q+#W(W7Sz`DB}o%KSR!26YNRonN?9*Jz1ce>I{!B>W+(+zB_yR-SJ4SW7g z_?Pq!cSgp_jM#nyX=%15ROR9Zl=((D0P}EfpolXU7a)chR#TC5aC30GcgJQ|uMg^m z_Vc_zpQFwIG@eKj{q$`{&ibmu)!FjYH|AYkFJd>Col|(#Xq~({#6zO=x{ZWLDt$>= zjIBK&+d-Ry0YukeOQ~xgTXT6Dr8SwpTm52!i*CE>DpQ}vi&T*#PY#OFt@6mBMis9n zN@p?{kO9;?6+s60wvRe6P~X6MX3`rl%}@qO>k6|k@!kC2z;>zV&g|6=Vbygs0W@a& zVW|x>MM`wb`9{tzW?aCHvTt2KbLvR8M;#@DTWWrzmAw%n31e%$!b%^RCf>5Qrli@^Ud+0%5yTlRM0#cfr+$bYc;-nz zRiyz$&dc*Nbuw<>@CdQmJFdkdTIE>^)f)=I$tnVoCm|lKxbTHy2dfoZ!%+0GU5iI9 za~>RTe|P1&2;eZ`SeQwU>(We)973ofDL~7VHpAOGx5T`gmaHydCroDJFunx8``kK& z3!PXuE{k1Ss{S$BGQZd&`TZsZ*Ejm-A|=bC%d;-WE_q5;>;!UhWuSc~)-S_fT$|IB zb*$)nRXGyc6VA#iGz6>ss5F{Ysj|GmC5?3Yp&t1=#{+$8DixG}HMDOvi!3s<$&Ab@vl2Bv%kXG+R7Tz%#%&uvp8~0ZF^6T3w zYuYqGnzMNX2E5{Mvm>N`M?4~p-bl1)q-Gge8>R&YBCVTOIzeX~6otxIW$B^`jfYi7 z2yGdB&!NGaI{J<}l&U+qI>k6K_ZC8;E3pK>szyk}mF6Ggm4>Rz_5#&XUpBq$EiVvv z;Q~$SK=NXBbofw0Bp89 zJz?pp*A$hy!L7(WDi|J0{xJoqGF_OFDTzLrIH-+x4<{uC71c zkV86ku(W`fx09!HQpNF0#N;K+wHpiAM{&kI^=6W1i2EZexAmmmN-AN?gb&5V@FF}6 zPluGX;*3i1ltxkoCF`Cv2wXHLOaUq-je0NWfpdXPwk z{XCsDvsBYC{kHlNM+S5&50-k6x(Z^0c~G|Z0?m*>lC~{euw{vo23M{IeegkC{0sVE zpL7?A?}d`zCz1&jq@mzHS~y(!A}N)V`+uj!rd=&bn-&iQ)AE9S1{^<9)4j6iUfl@i zEA(y+6GaZ~nPz>Ri!|#L;|yQXe*N*pf-JLT-S^JLeb?Z)1QA>b9z!7gKGL9xs)Gs} z)&i@q{QT+^*;VrGMOQUi{5sEIIzqi@`?rSeMNr&;Bvg}=SmvrG>Fv_N{wgbqfyl(_ z4~aT_P_h07xZbfxL&H&K%)7WnZk-lflFTjEVIaEX3znyLx9#L-())b<@d|| z=conkLh1ZI-aqDFRJ*muWnY4Cp}&w(t2V{ck4i{?g}OX&B5R9P@yD+b#`VXE%A_RA z{kb}Y%;f{qJErH&;^9stLzz#N`({FdR1r8y`1#T^QIGO#?j%n zpB+LESWf(BbHnm?-k#u8>Fpf2p+YDOe>n0PkLgY(0?5GX>-M|td1dOvP-0nJ7|QSg z64=<#Eb7l4n-^BpDN>o_EkcS6FH_|^$zOD{r3+v03dI)uyeE_?Komz0zbCOHDobQm zn7idKF*u5N_4(V+LOY3U|FQ;M8IS7uhHnpkPzs3&)T5?tO$_546DBwm&k)p$g00OI z?%6!<1xP3?k@Q7D)M{#+;+_3(wyeHd-_Gho77RPK@ppPXgBrq|fCt~CmxH&8=}ah! zN=w^B2s?_IoOO+0Qb`Qh%#^Wt9Z^_JTw?(mhT*-hBpO9=n9zzVSzt5=3`9vrDG1y{ z7>+%1%Z5!&w26)K;OAkg@WpHJCO_8|oRzbH?UjP(lZ*I27G*PEJr*frV|Ny z8?ys^JieEq9zA;$?!kaNpz*BrK>OT$30J+FvuT;)ie>5R5=C7rwfef*5-l$)FKxS& zWXJ2bajcu_2ew2iHI4dpgy{ma_&m^dt#B1r+=2e{FSq>`%QY=S+HK)vZKD_(hLEF> zliG=Otp;{Jt7kPKP6G;qC6grg)qir{qyD{*Eo|0G$b0BtyB~fYr}qzo4krskM^h7J zXUEsJ=D*rmj+jn_E*3QLqkwy~ALZe&5|q(!5-lHbnK2g1klEiYEX&+_ZFCNGY_JZ` z-D2^b_xg+FVyFS35sGD8vd!V^_g;U3?fF{#LzkRkJrK!@D@)T%u@R^vNbu%v2QLW%ao`D`RV|v|;Fy6~Efn z_jOEcGi&h7*V4P!843+grZ?95n)gHx)4qYn!R+r%Jx&IV?Ch+54fzy@jKR2q|gw8V*){wi(TpelPKnr=UCoy8JBmW~sP#Z)+DHct3xYz7$pyTnI! zTZM?!<2yOLrt2dA*XUE#D)6ST$^qqlKtmJ;qMHqo*WU$YtD)R)=^crmAz9WVT)P*e z#+?t!%(mw)xP&vb3`JVcZTW3WP1JBnn)OAlz)?6lx`okuCV@e8GF>v;6x|+W=uMh% zoIwuuobPZ?9lqWmp8q->|2$#^JOHWhXlng*wCcs}+H(K!+3Ldt9Ptksrbo%LgfW@V zENH!tB+qu6oEtX!yfZWTzMP5An^f8Racay2&4ad95k-dSCb-zN%k6io-X_h2%cOd& z1u-oFE{XxdcKxnxipSpW8p{E%N?T{3Jyh(wV4A;!jjn?NxA~J6m?-0Qy_Lq?QrCN_>WkbsTn7pm*L}JKDJf0KhZm=ps0D!d*o7hYU*28XZ(bA&Cn~}AgbKGdrG#x&zmV8f*ks<9Knnq-TvRn<3GWHRE>;DFli z>a+TDG_#%G>L6tWwI970T3l{CnntCzG-d&r@X#PJU2#qW+E9*AnX|)ej_V>Jk)vK! z(jOhwJ)hG|0Z!LI(>lPLWE+y`)l$zI*wIEp19nwd&r>7{v{1hg+eCXNejDSxJ%8i4 zUi@Q4c;Wg5DAXZt^S;$swH9SH_xh0aIQf}ewA_vm&wx8+FXv1@0B1`!Oyr^a*TD|y zlh<=9NIfgN7)d{2;OpY?N$}*2)JEz_M=DH>nI;7fJJ7~k19@@rS}z>7o+NnDiI$W3 zLvkZOjA$QO7i}>+TW3>SXMGh9ds8Rf$0RRH8j^=%#R~aJ-PZj<`=zqIVU(f4x@j@x zSLo><9qeZ*nE}T?GHDHj)?z{!%JU%Wof31xa5Ch9nx1^{FABU>S8sb7V5> zu=8r=V`>GiDWz#4a}x1)sxZjXWvZWfZBPb?sPWt4)Rm@(qeZ2V$1Arve}AK73y-b3 z%`vz0hRY=IBrnKy+pVMSFgiC}ON14NwG{!#*#LGV@)4E~DQQ=YnLC>*P7x+R1fr{q z*Odw<`%{upBMW8Tn*g7$J6<5`y&MHJIoh2M@FOe`n0!ojxr=j~c(7qCce2Z=GK$^S_)uvH`!>BVYZv{z7g4-QQDk z`%(C}zZZX1`xn*y6r!h`?Vs8)4@deBr~hVfpQ=A)9RE~DeIU2~RR4p2{9l`X(D>JS z-1KXhf6;7@+x_Q|{2e78!oL)#r(g7GC3|%A8zqCks{Kb@dur!ti}I(PG2DM{Se{yV zI;{KC0^Gyw?y=?hV|4ew!20{%+aE5(KWL$+9-s0@e|p^htJ9mw*U2|ADPkL>oLL+za#vp{0C3^_p|+!DSg!a9ksgu(EXD)1p#3n`-6x7K|qIq Mdype0dHn1D09Q%wxBvhE literal 0 HcmV?d00001