diff --git a/docs/LARGEREAD.md b/docs/LARGEREAD.md new file mode 100644 index 00000000..512596d4 --- /dev/null +++ b/docs/LARGEREAD.md @@ -0,0 +1,22 @@ +# 10M以上文件读取说明 +03版没有办法处理,相对内存占用大很多。excel 07版本有个[共享字符串](https://docs.microsoft.com/zh-cn/office/open-xml/working-with-the-shared-string-table)的概念,这个会非常占用内存,如果全部读取到内存的话,大概是excel文件的大小的3-10倍,所以easyexcel采用先存储到文件,然后再反序列化去读取的策略来节约内存。当然,通过文件反序列化以后,效率会降低,大概降低30-50%(不一定,也看命中率,可能会超过100%) +## 如果对读取效率感觉还能接受,就用默认的,永久占用(单个excel读取整个过程)一般不会超过50M(大概率就30M),剩下临时的GC会很快回收 +## 默认大文件处理 +默认大文件处理会自动判断,共享字符串5M以下会使用内存存储,大概占用15-50M的内存,超过5M则使用文件存储,然后文件存储也要设置多少M内存用来存放临时的共享字符串,默认20M。除了共享字符串占用内存外,其他占用较少,所以可以预估10M,所以默认大概30M就能读取一个超级大的文件。 +## 根据实际需求配置内存 +想自定义设置,首先要确定你大概愿意花多少内存来读取一个超级大的excel,比如希望读取excel最多占用100M内存(是读取过程中永久占用,新生代马上回收的不算),那就设置使用文件来存储共享字符串的大小判断为20M(小于20M存内存,大于存临时文件),然后设置文件存储时临时共享字符串占用内存大小90M差不多 +### 如果最大文件条数也就十几二十万,然后excel也就是十几二十M,而且不会有很高的并发,并且内存也较大 +```java + // 强制使用内存存储,这样大概一个20M的excel使用150M(很多临时对象,所以100M会一直GC)的内存 +// 这样效率会比上面的复杂的策略高很多 + EasyExcel.read().readCache(new MapCache()); +``` +### 对并发要求较高,而且都是经常有超级大文件 +```java + // 第一个参数的意思是 多少M共享字符串以后 采用文件存储 单位MB 默认5M +// 第二个参数 文件存储时,内存存放多少M缓存数据 默认20M +// 比如 你希望用100M内存(这里说的是解析过程中的永久占用,临时对象不算)来解析excel,按前面的说明 大概是 文件存储阈值20M、缓存90M 所以设置参数为:20 和 90(下面示例中的 5 和 20 是默认值) +EasyExcel.read().readCacheSelector(new SimpleReadCacheSelector(5, 20)); +``` +### 关于maxCacheActivateSize 也就是前面第二个参数的详细说明 +easyexcel在使用文件存储的时候,会把共享字符串拆分成1000条一批,然后放到文件存储。然后excel来读取共享字符串大概率是按照顺序的,所以默认会把最多20M的数据(每批1000条)放在内存,命中后直接返回,没命中去读文件。所以不能设置太小,太小了,很难命中,一直去读取文件,太大了的话会占用过多的内存。 \ No newline at end of file diff --git a/pom.xml b/pom.xml index 52bfae04..8777c67a 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ 4.0.0 com.alibaba easyexcel - 2.0.4 + 2.0.5 jar easyexcel diff --git a/quickstart.md b/quickstart.md index ecee7607..11f6eb10 100644 --- a/quickstart.md +++ b/quickstart.md @@ -14,6 +14,7 @@ ClassNotFoundException与java.lang.NoClassDefFoundError的区别 * 出现`NoSuchMethodException`,`ClassNotFoundException`,`NoClassDefFoundError`极大概率是jar冲突,建议`clean`项目,或者统一`poi` 
的版本,理论上来说`easyexcel`兼容poi的`3.17`,`4.0.1`,`4.1.0`所有较新版本 * 如果在读的时候`Listener`里面需要使用spring的`@Autowired`,给`Listener`创建成员变量,然后在构造方法里面传进去。而别必须不让spring管理`Listener`,每次读取都要`new`一个。 * 如果用`String`去接收数字,出现小数点等情况,这个是BUG,但是很难修复,后续版本会修复这个问题。目前请使用`@NumberFormat`直接,里面的参数就是调用了java自带的`NumberFormat.format`方法,不知道怎么入参的可以自己网上查询。 +* 10M以上xlsx读取很慢或者内存占用大 请先阅读[10M以上文件读取说明](/docs/LARGEREAD.md) #### 详细参数介绍 有些参数不知道怎么用,或者有些功能不知道用什么参数,参照:[详细参数介绍](/docs/API.md) #### 开源项目不容易,如果觉得本项目对您的工作还是有帮助的话,请在右上角帮忙点个★Star。 diff --git a/src/main/java/com/alibaba/excel/analysis/ExcelAnalyserImpl.java b/src/main/java/com/alibaba/excel/analysis/ExcelAnalyserImpl.java index dcae5f1b..cf2dff97 100644 --- a/src/main/java/com/alibaba/excel/analysis/ExcelAnalyserImpl.java +++ b/src/main/java/com/alibaba/excel/analysis/ExcelAnalyserImpl.java @@ -119,7 +119,7 @@ public class ExcelAnalyserImpl implements ExcelAnalyser { } try { if (readWorkbookHolder.getOpcPackage() != null) { - readWorkbookHolder.getOpcPackage().close(); + readWorkbookHolder.getOpcPackage().revert(); } } catch (Throwable t) { throwCanNotCloseIo(t); diff --git a/src/main/java/com/alibaba/excel/analysis/v03/XlsSaxAnalyser.java b/src/main/java/com/alibaba/excel/analysis/v03/XlsSaxAnalyser.java index 19aa2563..36d2b74b 100644 --- a/src/main/java/com/alibaba/excel/analysis/v03/XlsSaxAnalyser.java +++ b/src/main/java/com/alibaba/excel/analysis/v03/XlsSaxAnalyser.java @@ -2,7 +2,6 @@ package com.alibaba.excel.analysis.v03; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; @@ -60,11 +59,9 @@ import com.alibaba.excel.util.CollectionUtils; public class XlsSaxAnalyser implements HSSFListener, ExcelExecutor { private static final Logger LOGGER = LoggerFactory.getLogger(XlsSaxAnalyser.class); - private boolean outputFormulaValues = true; private POIFSFileSystem poifsFileSystem; private int lastRowNumber; private int lastColumnNumber; - private boolean notAllEmpty 
= false; /** * For parsing Formulas */ @@ -105,11 +102,7 @@ public class XlsSaxAnalyser implements HSSFListener, ExcelExecutor { init(); HSSFEventFactory factory = new HSSFEventFactory(); HSSFRequest request = new HSSFRequest(); - if (outputFormulaValues) { - request.addListenerForAllRecords(formatListener); - } else { - request.addListenerForAllRecords(workbookBuildingListener); - } + request.addListenerForAllRecords(formatListener); try { factory.processWorkbookEvents(request, poifsFileSystem); } catch (IOException e) { @@ -145,20 +138,17 @@ public class XlsSaxAnalyser implements HSSFListener, ExcelExecutor { cellData = handler.getCellData(); if (cellData != null) { cellData.checkEmpty(); - records.put(thisColumn, cellData); + if (CellDataTypeEnum.EMPTY != cellData.getType()) { + records.put(thisColumn, cellData); + } } break; } } // If we got something to print out, do so - if (cellData != null) { - if (analysisContext.currentReadHolder().globalConfiguration().getAutoTrim() - && CellDataTypeEnum.STRING == cellData.getType()) { - cellData.setStringValue(cellData.getStringValue().trim()); - } - if (CellDataTypeEnum.EMPTY != cellData.getType()) { - notAllEmpty = true; - } + if (cellData != null && analysisContext.currentReadHolder().globalConfiguration().getAutoTrim() + && CellDataTypeEnum.STRING == cellData.getType()) { + cellData.setStringValue(cellData.getStringValue().trim()); } // Handle new row @@ -193,11 +183,9 @@ public class XlsSaxAnalyser implements HSSFListener, ExcelExecutor { if (lastColumnNumber == -1) { lastColumnNumber = 0; } - if (notAllEmpty) { - analysisContext.readRowHolder( - new ReadRowHolder(lastRowNumber, analysisContext.readSheetHolder().getGlobalConfiguration())); - analysisContext.readSheetHolder().notifyEndOneRow(new EachRowAnalysisFinishEvent(records), analysisContext); - } + analysisContext.readRowHolder( + new ReadRowHolder(lastRowNumber, analysisContext.readSheetHolder().getGlobalConfiguration())); + 
analysisContext.readSheetHolder().notifyEndOneRow(new EachRowAnalysisFinishEvent(records), analysisContext); records.clear(); lastColumnNumber = -1; } diff --git a/src/main/java/com/alibaba/excel/analysis/v07/XlsxSaxAnalyser.java b/src/main/java/com/alibaba/excel/analysis/v07/XlsxSaxAnalyser.java index 2a8b36ec..932fb650 100644 --- a/src/main/java/com/alibaba/excel/analysis/v07/XlsxSaxAnalyser.java +++ b/src/main/java/com/alibaba/excel/analysis/v07/XlsxSaxAnalyser.java @@ -20,15 +20,12 @@ import org.apache.poi.xssf.usermodel.XSSFRelation; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorkbook; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTWorkbookPr; import org.openxmlformats.schemas.spreadsheetml.x2006.main.WorkbookDocument; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.XMLReader; import com.alibaba.excel.analysis.ExcelExecutor; -import com.alibaba.excel.cache.Ehcache; -import com.alibaba.excel.cache.MapCache; +import com.alibaba.excel.cache.ReadCache; import com.alibaba.excel.context.AnalysisContext; import com.alibaba.excel.exception.ExcelAnalysisException; import com.alibaba.excel.read.metadata.ReadSheet; @@ -41,11 +38,7 @@ import com.alibaba.excel.util.FileUtils; * @author jipengfei */ public class XlsxSaxAnalyser implements ExcelExecutor { - private static final Logger LOGGER = LoggerFactory.getLogger(XlsxSaxAnalyser.class); - /** - * If it's less than 5M, use map cache, or use ehcache. 
- */ - private static final long USE_MAP_CACHE_SIZE = 5 * 1000 * 1000L; + private AnalysisContext analysisContext; private List sheetList; private Map sheetMap; @@ -93,28 +86,10 @@ public class XlsxSaxAnalyser implements ExcelExecutor { } } - private void defaultReadCache(ReadWorkbookHolder readWorkbookHolder, PackagePart sharedStringsTablePackagePart) - throws IOException { - if (readWorkbookHolder.getReadCache() != null) { - readWorkbookHolder.getReadCache().init(analysisContext); - return; - } - long size = sharedStringsTablePackagePart.getSize(); - if (size < 0) { - size = sharedStringsTablePackagePart.getInputStream().available(); - } - if (size < USE_MAP_CACHE_SIZE) { - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("Use map cache.size:{}", size); - } - readWorkbookHolder.setReadCache(new MapCache()); - } else { - if (LOGGER.isDebugEnabled()) { - LOGGER.debug("Use ehcache.size:{}", size); - } - readWorkbookHolder.setReadCache(new Ehcache()); - } - readWorkbookHolder.getReadCache().init(analysisContext); + private void defaultReadCache(ReadWorkbookHolder readWorkbookHolder, PackagePart sharedStringsTablePackagePart) { + ReadCache readCache = readWorkbookHolder.getReadCacheSelector().readCache(sharedStringsTablePackagePart); + readWorkbookHolder.setReadCache(readCache); + readCache.init(analysisContext); } private void analysisUse1904WindowDate(XSSFReader xssfReader, ReadWorkbookHolder readWorkbookHolder) diff --git a/src/main/java/com/alibaba/excel/cache/Ehcache.java b/src/main/java/com/alibaba/excel/cache/Ehcache.java index abf0169a..4e757fcd 100644 --- a/src/main/java/com/alibaba/excel/cache/Ehcache.java +++ b/src/main/java/com/alibaba/excel/cache/Ehcache.java @@ -2,15 +2,10 @@ package com.alibaba.excel.cache; import java.io.File; import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.Map; -import java.util.Set; import java.util.UUID; import org.ehcache.CacheManager; -import 
org.ehcache.PersistentCacheManager; +import org.ehcache.config.CacheConfiguration; import org.ehcache.config.builders.CacheConfigurationBuilder; import org.ehcache.config.builders.CacheManagerBuilder; import org.ehcache.config.builders.ResourcePoolsBuilder; @@ -19,8 +14,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.alibaba.excel.context.AnalysisContext; +import com.alibaba.excel.util.CollectionUtils; import com.alibaba.excel.util.FileUtils; -import com.alibaba.excel.util.StringUtils; /** * Default cache @@ -30,79 +25,64 @@ import com.alibaba.excel.util.StringUtils; public class Ehcache implements ReadCache { private static final Logger LOGGER = LoggerFactory.getLogger(Ehcache.class); - private static final int BATCH_COUNT = 1000; - private static final int CHECK_INTERVAL = 500; - private static final int MAX_CACHE_ACTIVATE = 10; - - private static final String CACHE = "cache"; - private static final String DATA_SEPARATOR = "@"; - private static final String KEY_VALUE_SEPARATOR = "!"; - private static final String SPECIAL_SEPARATOR = "&"; - private static final String ESCAPED_DATA_SEPARATOR = "&d;"; - private static final String ESCAPED_KEY_VALUE_SEPARATOR = "&kv;"; - private static final String ESCAPED_SPECIAL_SEPARATOR = "&s;"; - private static final int DEBUG_WRITE_SIZE = 100 * 10000; private static final int DEBUG_CACHE_MISS_SIZE = 1000; - /** * Key index */ private int index = 0; - private StringBuilder data = new StringBuilder(); - private CacheManager cacheManager; + private HashMap dataMap = new HashMap(BATCH_COUNT * 4 / 3 + 1); + private static CacheManager fileCacheManager; + private static CacheConfiguration fileCacheConfiguration; + private static CacheManager activeCacheManager; + private CacheConfiguration activeCacheConfiguration; /** * Bulk storage data */ - private org.ehcache.Cache cache; + private org.ehcache.Cache fileCache; /** * Currently active cache */ - private Map> cacheMap = new HashMap>(); - /** - * Count how 
many times get - */ - private int getCount = 0; - /** - * Count active cache - * - */ - private LinkedList countList = new LinkedList(); - - /** - * Count the last {@link #CHECK_INTERVAL} used - */ - private Set lastCheckIntervalUsedSet = new HashSet(); - + private org.ehcache.Cache activeCache; + private String cacheAlias; /** * Count the number of cache misses */ private int cacheMiss = 0; + public Ehcache(int maxCacheActivateSize) { + activeCacheConfiguration = CacheConfigurationBuilder + .newCacheConfigurationBuilder(Integer.class, HashMap.class, + ResourcePoolsBuilder.newResourcePoolsBuilder().heap(maxCacheActivateSize, MemoryUnit.MB)) + .withSizeOfMaxObjectGraph(1000 * 1000L).withSizeOfMaxObjectSize(maxCacheActivateSize, MemoryUnit.MB) + .build(); + } + + static { + File cacheFile = FileUtils.createCacheTmpFile(); + fileCacheManager = + CacheManagerBuilder.newCacheManagerBuilder().with(CacheManagerBuilder.persistence(cacheFile)).build(true); + activeCacheManager = CacheManagerBuilder.newCacheManagerBuilder().build(true); + fileCacheConfiguration = CacheConfigurationBuilder + .newCacheConfigurationBuilder(Integer.class, HashMap.class, + ResourcePoolsBuilder.newResourcePoolsBuilder().disk(10, MemoryUnit.GB)) + .withSizeOfMaxObjectGraph(1000 * 1000L).withSizeOfMaxObjectSize(10, MemoryUnit.GB).build(); + } + @Override public void init(AnalysisContext analysisContext) { - File readTempFile = analysisContext.readWorkbookHolder().getTempFile(); - if (readTempFile == null) { - readTempFile = FileUtils.createCacheTmpFile(); - analysisContext.readWorkbookHolder().setTempFile(readTempFile); - } - File cacheFile = new File(readTempFile.getPath(), UUID.randomUUID().toString()); - PersistentCacheManager persistentCacheManager = - CacheManagerBuilder.newCacheManagerBuilder().with(CacheManagerBuilder.persistence(cacheFile)) - .withCache(CACHE, CacheConfigurationBuilder.newCacheConfigurationBuilder(Integer.class, String.class, - 
ResourcePoolsBuilder.newResourcePoolsBuilder().disk(10, MemoryUnit.GB))) - .build(true); - cacheManager = persistentCacheManager; - cache = persistentCacheManager.getCache(CACHE, Integer.class, String.class); + cacheAlias = UUID.randomUUID().toString(); + fileCache = fileCacheManager.createCache(cacheAlias, fileCacheConfiguration); + activeCache = activeCacheManager.createCache(cacheAlias, activeCacheConfiguration); } @Override public void put(String value) { - data.append(index).append(KEY_VALUE_SEPARATOR).append(escape(value)).append(DATA_SEPARATOR); + dataMap.put(index, value); if ((index + 1) % BATCH_COUNT == 0) { - cache.put(index / BATCH_COUNT, data.toString()); - data = new StringBuilder(); + fileCache.put(index / BATCH_COUNT, dataMap); + dataMap = new HashMap(BATCH_COUNT * 4 / 3 + 1); } index++; if (LOGGER.isDebugEnabled()) { @@ -112,102 +92,37 @@ public class Ehcache implements ReadCache { } } - private String escape(String str) { - if (StringUtils.isEmpty(str)) { - return str; - } - str = str.replaceAll(SPECIAL_SEPARATOR, ESCAPED_SPECIAL_SEPARATOR); - str = str.replaceAll(DATA_SEPARATOR, ESCAPED_DATA_SEPARATOR); - str = str.replaceAll(KEY_VALUE_SEPARATOR, ESCAPED_KEY_VALUE_SEPARATOR); - return str; - } - - private String unescape(String str) { - if (StringUtils.isEmpty(str)) { - return str; - } - str = str.replaceAll(ESCAPED_KEY_VALUE_SEPARATOR, KEY_VALUE_SEPARATOR); - str = str.replaceAll(ESCAPED_DATA_SEPARATOR, DATA_SEPARATOR); - str = str.replaceAll(ESCAPED_SPECIAL_SEPARATOR, SPECIAL_SEPARATOR); - return str; - } - @Override public String get(Integer key) { if (key == null || key < 0) { return null; } - getCount++; int route = key / BATCH_COUNT; - if (cacheMap.containsKey(route)) { - lastCheckIntervalUsedSet.add(route); - String value = cacheMap.get(route).get(key); - checkClear(); - return value; - } - Map tempCacheMap = new HashMap(BATCH_COUNT / 3 * 4 + 1); - String batchData = cache.get(route); - String[] dataStrings = 
batchData.split(DATA_SEPARATOR); - for (String dataString : dataStrings) { - String[] keyValue = dataString.split(KEY_VALUE_SEPARATOR); - tempCacheMap.put(Integer.valueOf(keyValue[0]), unescape(keyValue[1])); - } - countList.add(route); - cacheMap.put(route, tempCacheMap); - if (LOGGER.isDebugEnabled()) { - if (cacheMiss++ % DEBUG_CACHE_MISS_SIZE == 0) { - LOGGER.debug("Cache misses count:{}", cacheMiss); - } - } - lastCheckIntervalUsedSet.add(route); - String value = tempCacheMap.get(key); - checkClear(); - return value; - } - - private void checkClear() { - if (countList.size() > MAX_CACHE_ACTIVATE) { - Integer route = countList.getFirst(); - countList.removeFirst(); - cacheMap.remove(route); - } - if (getCount++ % CHECK_INTERVAL != 0) { - return; - } - Iterator>> iterator = cacheMap.entrySet().iterator(); - while (iterator.hasNext()) { - Map.Entry> entry = iterator.next(); - if (lastCheckIntervalUsedSet.contains(entry.getKey())) { - continue; - } - // Last 'CHECK_INTERVAL' not use - iterator.remove(); + HashMap dataMap = activeCache.get(route); + if (dataMap == null) { + dataMap = fileCache.get(route); + activeCache.put(route, dataMap); if (LOGGER.isDebugEnabled()) { - LOGGER.debug("Cache remove because {} times unused.", CHECK_INTERVAL); - } - Iterator countIterator = countList.iterator(); - while (countIterator.hasNext()) { - Integer route = countIterator.next(); - if (route.equals(entry.getKey())) { - countIterator.remove(); - break; + if (cacheMiss++ % DEBUG_CACHE_MISS_SIZE == 0) { + LOGGER.debug("Cache misses count:{}", cacheMiss); } } } - lastCheckIntervalUsedSet.clear(); + return dataMap.get(key); } @Override public void putFinished() { - if (StringUtils.isEmpty(data.toString())) { + if (CollectionUtils.isEmpty(dataMap)) { return; } - cache.put(index / BATCH_COUNT, data.toString()); + fileCache.put(index / BATCH_COUNT, dataMap); } @Override public void destroy() { - cacheManager.close(); + fileCacheManager.removeCache(cacheAlias); + 
activeCacheManager.removeCache(cacheAlias); } } diff --git a/src/main/java/com/alibaba/excel/cache/selector/EternalReadCacheSelector.java b/src/main/java/com/alibaba/excel/cache/selector/EternalReadCacheSelector.java new file mode 100644 index 00000000..9730dc08 --- /dev/null +++ b/src/main/java/com/alibaba/excel/cache/selector/EternalReadCacheSelector.java @@ -0,0 +1,23 @@ +package com.alibaba.excel.cache.selector; + +import org.apache.poi.openxml4j.opc.PackagePart; + +import com.alibaba.excel.cache.ReadCache; + +/** + * Choose a eternal cache + * + * @author Jiaju Zhuang + **/ +public class EternalReadCacheSelector implements ReadCacheSelector { + private ReadCache readCache; + + public EternalReadCacheSelector(ReadCache readCache) { + this.readCache = readCache; + } + + @Override + public ReadCache readCache(PackagePart sharedStringsTablePackagePart) { + return readCache; + } +} diff --git a/src/main/java/com/alibaba/excel/cache/selector/ReadCacheSelector.java b/src/main/java/com/alibaba/excel/cache/selector/ReadCacheSelector.java new file mode 100644 index 00000000..3a2e5024 --- /dev/null +++ b/src/main/java/com/alibaba/excel/cache/selector/ReadCacheSelector.java @@ -0,0 +1,21 @@ +package com.alibaba.excel.cache.selector; + +import org.apache.poi.openxml4j.opc.PackagePart; + +import com.alibaba.excel.cache.ReadCache; + +/** + * Select the cache + * + * @author Jiaju Zhuang + **/ +public interface ReadCacheSelector { + + /** + * Select a cache + * + * @param sharedStringsTablePackagePart + * @return + */ + ReadCache readCache(PackagePart sharedStringsTablePackagePart); +} diff --git a/src/main/java/com/alibaba/excel/cache/selector/SimpleReadCacheSelector.java b/src/main/java/com/alibaba/excel/cache/selector/SimpleReadCacheSelector.java new file mode 100644 index 00000000..d7cedd07 --- /dev/null +++ b/src/main/java/com/alibaba/excel/cache/selector/SimpleReadCacheSelector.java @@ -0,0 +1,82 @@ +package com.alibaba.excel.cache.selector; + +import 
java.io.IOException; + +import org.apache.poi.openxml4j.opc.PackagePart; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.alibaba.excel.cache.Ehcache; +import com.alibaba.excel.cache.MapCache; +import com.alibaba.excel.cache.ReadCache; + +/** + * Simple cache selector + * + * @author Jiaju Zhuang + **/ +public class SimpleReadCacheSelector implements ReadCacheSelector { + private static final Logger LOGGER = LoggerFactory.getLogger(SimpleReadCacheSelector.class); + /** + * Convert bytes to megabytes + */ + private static final long B2M = 1000 * 1000L; + /** + * If it's less than 5M, use map cache, or use ehcache.unit MB. + */ + private static final int DEFAULT_MAX_USE_MAP_CACHE_SIZE = 5; + /** + * Maximum size of cache activation.unit MB. + */ + private static final int DEFAULT_MAX_EHCACHE_ACTIVATE_SIZE = 20; + + /** + * Shared strings exceeding this value will use {@link Ehcache},or use {@link MapCache}.unit MB. + */ + private long maxUseMapCacheSize; + + /** + * Maximum size of cache activation.unit MB. 
+ */ + private int maxCacheActivateSize; + + public SimpleReadCacheSelector() { + this(DEFAULT_MAX_USE_MAP_CACHE_SIZE, DEFAULT_MAX_EHCACHE_ACTIVATE_SIZE); + } + + public SimpleReadCacheSelector(long maxUseMapCacheSize, int maxCacheActivateSize) { + if (maxUseMapCacheSize <= 0) { + this.maxUseMapCacheSize = DEFAULT_MAX_USE_MAP_CACHE_SIZE; + } else { + this.maxUseMapCacheSize = maxUseMapCacheSize; + } + if (maxCacheActivateSize <= 0) { + this.maxCacheActivateSize = DEFAULT_MAX_EHCACHE_ACTIVATE_SIZE; + } else { + this.maxCacheActivateSize = maxCacheActivateSize; + } + } + + @Override + public ReadCache readCache(PackagePart sharedStringsTablePackagePart) { + long size = sharedStringsTablePackagePart.getSize(); + if (size < 0) { + try { + size = sharedStringsTablePackagePart.getInputStream().available(); + } catch (IOException e) { + LOGGER.warn("Unable to get file size, default used MapCache"); + return new MapCache(); + } + } + if (size < maxUseMapCacheSize * B2M) { + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Use map cache.size:{}", size); + } + return new MapCache(); + } + if (LOGGER.isDebugEnabled()) { + LOGGER.debug("Use ehcache.size:{}", size); + } + return new Ehcache(maxCacheActivateSize); + } +} diff --git a/src/main/java/com/alibaba/excel/read/builder/ExcelReaderBuilder.java b/src/main/java/com/alibaba/excel/read/builder/ExcelReaderBuilder.java index 15d4a02c..2b5bcd6e 100644 --- a/src/main/java/com/alibaba/excel/read/builder/ExcelReaderBuilder.java +++ b/src/main/java/com/alibaba/excel/read/builder/ExcelReaderBuilder.java @@ -7,6 +7,7 @@ import java.util.List; import com.alibaba.excel.ExcelReader; import com.alibaba.excel.cache.ReadCache; +import com.alibaba.excel.cache.selector.ReadCacheSelector; import com.alibaba.excel.context.AnalysisContext; import com.alibaba.excel.converters.Converter; import com.alibaba.excel.event.AnalysisEventListener; @@ -84,6 +85,17 @@ public class ExcelReaderBuilder { return this; } + /** + * Ignore empty rows.Default is 
true. + * + * @param ignoreEmptyRow + * @return + */ + public ExcelReaderBuilder ignoreEmptyRow(Boolean ignoreEmptyRow) { + readWorkbook.setIgnoreEmptyRow(ignoreEmptyRow); + return this; + } + /** * This object can be read in the Listener {@link AnalysisEventListener#invoke(Object, AnalysisContext)} * {@link AnalysisContext#getCustom()} @@ -97,7 +109,7 @@ public class ExcelReaderBuilder { } /** - * A cache that stores temp data to save memory.Default use {@link com.alibaba.excel.cache.Ehcache} + * A cache that stores temp data to save memory. * * @param readCache * @return @@ -107,6 +119,17 @@ public class ExcelReaderBuilder { return this; } + /** + * Select the cache.Default use {@link com.alibaba.excel.cache.selector.SimpleReadCacheSelector} + * + * @param readCacheSelector + * @return + */ + public ExcelReaderBuilder readCacheSelector(ReadCacheSelector readCacheSelector) { + readWorkbook.setReadCacheSelector(readCacheSelector); + return this; + } + /** * Count the number of added heads when read sheet. * diff --git a/src/main/java/com/alibaba/excel/read/metadata/ReadWorkbook.java b/src/main/java/com/alibaba/excel/read/metadata/ReadWorkbook.java index 360f941f..b21cdb80 100644 --- a/src/main/java/com/alibaba/excel/read/metadata/ReadWorkbook.java +++ b/src/main/java/com/alibaba/excel/read/metadata/ReadWorkbook.java @@ -4,6 +4,7 @@ import java.io.File; import java.io.InputStream; import com.alibaba.excel.cache.ReadCache; +import com.alibaba.excel.cache.selector.ReadCacheSelector; import com.alibaba.excel.context.AnalysisContext; import com.alibaba.excel.event.AnalysisEventListener; import com.alibaba.excel.support.ExcelTypeEnum; @@ -47,9 +48,17 @@ public class ReadWorkbook extends ReadBasicParameter { */ private Object customObject; /** - * A cache that stores temp data to save memory.Default use {@link com.alibaba.excel.cache.Ehcache} + * A cache that stores temp data to save memory. */ private ReadCache readCache; + /** + * Ignore empty rows.Default is true. 
+ */ + private Boolean ignoreEmptyRow; + /** + * Select the cache.Default use {@link com.alibaba.excel.cache.selector.SimpleReadCacheSelector} + */ + private ReadCacheSelector readCacheSelector; /** * The default is all excel objects.Default is true. *

@@ -139,4 +148,20 @@ public class ReadWorkbook extends ReadBasicParameter { public void setDefaultReturnMap(Boolean defaultReturnMap) { this.defaultReturnMap = defaultReturnMap; } + + public Boolean getIgnoreEmptyRow() { + return ignoreEmptyRow; + } + + public void setIgnoreEmptyRow(Boolean ignoreEmptyRow) { + this.ignoreEmptyRow = ignoreEmptyRow; + } + + public ReadCacheSelector getReadCacheSelector() { + return readCacheSelector; + } + + public void setReadCacheSelector(ReadCacheSelector readCacheSelector) { + this.readCacheSelector = readCacheSelector; + } } diff --git a/src/main/java/com/alibaba/excel/read/metadata/holder/AbstractReadHolder.java b/src/main/java/com/alibaba/excel/read/metadata/holder/AbstractReadHolder.java index f9dd0d66..4ca8d63c 100644 --- a/src/main/java/com/alibaba/excel/read/metadata/holder/AbstractReadHolder.java +++ b/src/main/java/com/alibaba/excel/read/metadata/holder/AbstractReadHolder.java @@ -5,17 +5,18 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.alibaba.excel.context.AnalysisContext; import com.alibaba.excel.converters.Converter; import com.alibaba.excel.converters.ConverterKeyBuild; import com.alibaba.excel.converters.DefaultConverterLoader; -import com.alibaba.excel.enums.CellDataTypeEnum; import com.alibaba.excel.enums.HeadKindEnum; import com.alibaba.excel.enums.HolderEnum; import com.alibaba.excel.event.AnalysisEventListener; import com.alibaba.excel.exception.ExcelAnalysisException; import com.alibaba.excel.exception.ExcelAnalysisStopException; -import com.alibaba.excel.exception.ExcelDataConvertException; import com.alibaba.excel.metadata.AbstractHolder; import com.alibaba.excel.metadata.CellData; import com.alibaba.excel.metadata.Head; @@ -26,6 +27,7 @@ import com.alibaba.excel.read.listener.ReadListenerRegistryCenter; import com.alibaba.excel.read.listener.event.AnalysisFinishEvent; import 
com.alibaba.excel.read.metadata.ReadBasicParameter; import com.alibaba.excel.read.metadata.property.ExcelReadHeadProperty; +import com.alibaba.excel.util.CollectionUtils; import com.alibaba.excel.util.ConverterUtils; import com.alibaba.excel.util.StringUtils; @@ -35,6 +37,8 @@ import com.alibaba.excel.util.StringUtils; * @author Jiaju Zhuang */ public abstract class AbstractReadHolder extends AbstractHolder implements ReadHolder, ReadListenerRegistryCenter { + private static final Logger LOGGER = LoggerFactory.getLogger(AbstractReadHolder.class); + /** * Count the number of added heads when read sheet. * @@ -117,6 +121,14 @@ public abstract class AbstractReadHolder extends AbstractHolder implements ReadH @Override public void notifyEndOneRow(AnalysisFinishEvent event, AnalysisContext analysisContext) { Map cellDataMap = event.getAnalysisResult(); + if (CollectionUtils.isEmpty(cellDataMap)) { + if (LOGGER.isDebugEnabled()) { + LOGGER.warn("Empty row!"); + } + if (analysisContext.readWorkbookHolder().getIgnoreEmptyRow()) { + return; + } + } ReadRowHolder readRowHolder = analysisContext.readRowHolder(); readRowHolder.setCurrentRowAnalysisResult(cellDataMap); int rowIndex = readRowHolder.getRowIndex(); diff --git a/src/main/java/com/alibaba/excel/read/metadata/holder/ReadWorkbookHolder.java b/src/main/java/com/alibaba/excel/read/metadata/holder/ReadWorkbookHolder.java index 815ef06e..3fd88395 100644 --- a/src/main/java/com/alibaba/excel/read/metadata/holder/ReadWorkbookHolder.java +++ b/src/main/java/com/alibaba/excel/read/metadata/holder/ReadWorkbookHolder.java @@ -12,6 +12,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.alibaba.excel.cache.ReadCache; +import com.alibaba.excel.cache.selector.EternalReadCacheSelector; +import com.alibaba.excel.cache.selector.ReadCacheSelector; +import com.alibaba.excel.cache.selector.SimpleReadCacheSelector; import com.alibaba.excel.context.AnalysisContext; import com.alibaba.excel.enums.HolderEnum; import 
com.alibaba.excel.event.AnalysisEventListener; @@ -64,10 +67,17 @@ public class ReadWorkbookHolder extends AbstractReadHolder { */ private Object customObject; /** - * A cache that stores temp data to save memory.Default use {@link com.alibaba.excel.cache.Ehcache} + * Ignore empty rows.Default is true. + */ + private Boolean ignoreEmptyRow; + /** + * A cache that stores temp data to save memory. */ private ReadCache readCache; - + /** + * Select the cache.Default use {@link com.alibaba.excel.cache.selector.SimpleReadCacheSelector} + */ + private ReadCacheSelector readCacheSelector; /** * Temporary files when reading excel */ @@ -140,9 +150,22 @@ public class ReadWorkbookHolder extends AbstractReadHolder { getGlobalConfiguration().setUse1904windowing(Boolean.FALSE); } this.customObject = readWorkbook.getCustomObject(); - this.readCache = readWorkbook.getReadCache(); - if (readCache != null && ExcelTypeEnum.XLS == excelType) { - LOGGER.warn("Xls not support 'readCache'!"); + if (readWorkbook.getIgnoreEmptyRow() == null) { + this.ignoreEmptyRow = Boolean.TRUE; + } else { + this.ignoreEmptyRow = readWorkbook.getIgnoreEmptyRow(); + } + if (readWorkbook.getReadCache() != null) { + if (readWorkbook.getReadCacheSelector() != null) { + throw new ExcelAnalysisException("'readCache' and 'readCacheSelector' only one choice."); + } + this.readCacheSelector = new EternalReadCacheSelector(readWorkbook.getReadCache()); + } else { + if (readWorkbook.getReadCacheSelector() == null) { + this.readCacheSelector = new SimpleReadCacheSelector(); + } else { + this.readCacheSelector = readWorkbook.getReadCacheSelector(); + } } if (readWorkbook.getDefaultReturnMap() == null) { this.defaultReturnMap = Boolean.TRUE; @@ -201,6 +224,14 @@ public class ReadWorkbookHolder extends AbstractReadHolder { this.customObject = customObject; } + public Boolean getIgnoreEmptyRow() { + return ignoreEmptyRow; + } + + public void setIgnoreEmptyRow(Boolean ignoreEmptyRow) { + this.ignoreEmptyRow = 
ignoreEmptyRow; + } + public ReadCache getReadCache() { return readCache; } @@ -209,6 +240,14 @@ public class ReadWorkbookHolder extends AbstractReadHolder { this.readCache = readCache; } + public ReadCacheSelector getReadCacheSelector() { + return readCacheSelector; + } + + public void setReadCacheSelector(ReadCacheSelector readCacheSelector) { + this.readCacheSelector = readCacheSelector; + } + public Boolean getMandatoryUseInputStream() { return mandatoryUseInputStream; } diff --git a/src/main/java/com/alibaba/excel/write/metadata/holder/WriteWorkbookHolder.java b/src/main/java/com/alibaba/excel/write/metadata/holder/WriteWorkbookHolder.java index fe23b2a1..fe5c395d 100644 --- a/src/main/java/com/alibaba/excel/write/metadata/holder/WriteWorkbookHolder.java +++ b/src/main/java/com/alibaba/excel/write/metadata/holder/WriteWorkbookHolder.java @@ -10,8 +10,6 @@ import java.util.HashMap; import java.util.Map; import org.apache.poi.ss.usermodel.Workbook; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import com.alibaba.excel.enums.HolderEnum; import com.alibaba.excel.exception.ExcelGenerateException; diff --git a/src/test/java/com/alibaba/easyexcel/test/temp/cache/CacheTest.java b/src/test/java/com/alibaba/easyexcel/test/temp/cache/CacheTest.java new file mode 100644 index 00000000..4ba065a3 --- /dev/null +++ b/src/test/java/com/alibaba/easyexcel/test/temp/cache/CacheTest.java @@ -0,0 +1,62 @@ +package com.alibaba.easyexcel.test.temp.cache; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; + +import org.apache.poi.xssf.streaming.SXSSFRow; +import org.apache.poi.xssf.streaming.SXSSFSheet; +import org.apache.poi.xssf.streaming.SXSSFWorkbook; +import org.apache.poi.xssf.usermodel.XSSFRow; +import org.apache.poi.xssf.usermodel.XSSFSheet; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.ehcache.Cache; +import org.ehcache.PersistentCacheManager; +import 
org.ehcache.config.builders.CacheConfigurationBuilder; +import org.ehcache.config.builders.CacheManagerBuilder; +import org.ehcache.config.builders.ResourcePoolsBuilder; +import org.ehcache.config.units.MemoryUnit; +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.alibaba.easyexcel.test.temp.poi.Poi2Test; +import com.alibaba.excel.util.FileUtils; +import com.alibaba.fastjson.JSON; + +/** + * + * @author Jiaju Zhuang + **/ +@Ignore +public class CacheTest { + private static final Logger LOGGER = LoggerFactory.getLogger(Poi2Test.class); + + @Test + public void cache() throws Exception { + + File readTempFile = FileUtils.createCacheTmpFile(); + + File cacheFile = new File(readTempFile.getPath(), UUID.randomUUID().toString()); + PersistentCacheManager persistentCacheManager = + CacheManagerBuilder.newCacheManagerBuilder().with(CacheManagerBuilder.persistence(cacheFile)) + .withCache("cache", CacheConfigurationBuilder.newCacheConfigurationBuilder(Integer.class, HashMap.class, + ResourcePoolsBuilder.newResourcePoolsBuilder().disk(10, MemoryUnit.GB))) + .build(true); + Cache cache = persistentCacheManager.getCache("cache", Integer.class, HashMap.class); + + HashMap map = new HashMap(); + map.put(1, "test"); + + cache.put(1, map); + LOGGER.info("dd1:{}", JSON.toJSONString(cache.get(1))); + + cache.clear(); + + LOGGER.info("dd2:{}", JSON.toJSONString(cache.get(1))); + } + +} diff --git a/src/test/java/com/alibaba/easyexcel/test/temp/large/LargeData.java b/src/test/java/com/alibaba/easyexcel/test/temp/large/LargeData.java new file mode 100644 index 00000000..1424c426 --- /dev/null +++ b/src/test/java/com/alibaba/easyexcel/test/temp/large/LargeData.java @@ -0,0 +1,60 @@ +package com.alibaba.easyexcel.test.temp.large; + +import lombok.Data; + +/** + * @author Jiaju Zhuang + */ +@Data +public class LargeData { + + private String str1; + + private String str2; + + private String str3; + + private String 
str4; + + private String str5; + + private String str6; + + private String str7; + + private String str8; + + private String str9; + + private String str10; + + private String str11; + + private String str12; + + private String str13; + + private String str14; + + private String str15; + + private String str16; + + private String str17; + + private String str18; + + private String str19; + + private String str20; + + private String str21; + + private String str22; + + private String str23; + + private String str24; + + private String str25; +} diff --git a/src/test/java/com/alibaba/easyexcel/test/temp/large/LargeDataListener.java b/src/test/java/com/alibaba/easyexcel/test/temp/large/LargeDataListener.java new file mode 100644 index 00000000..a5b65565 --- /dev/null +++ b/src/test/java/com/alibaba/easyexcel/test/temp/large/LargeDataListener.java @@ -0,0 +1,33 @@ +package com.alibaba.easyexcel.test.temp.large; + +import org.junit.Assert; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.alibaba.excel.context.AnalysisContext; +import com.alibaba.excel.event.AnalysisEventListener; +import com.alibaba.fastjson.JSON; + +/** + * @author Jiaju Zhuang + */ +public class LargeDataListener extends AnalysisEventListener { + private static final Logger LOGGER = LoggerFactory.getLogger(LargeDataListener.class); + private int count = 0; + + @Override + public void invoke(LargeData data, AnalysisContext context) { + if (count == 0) { + LOGGER.info("First row:{}", JSON.toJSONString(data)); + } + count++; + if (count % 100000 == 0) { + LOGGER.info("Already read:{}", count); + } + } + + @Override + public void doAfterAllAnalysed(AnalysisContext context) { + LOGGER.info("Large row count:{}", count); + } +} diff --git a/src/test/java/com/alibaba/easyexcel/test/temp/large/TempLargeDataTest.java b/src/test/java/com/alibaba/easyexcel/test/temp/large/TempLargeDataTest.java new file mode 100644 index 00000000..6d142333 --- /dev/null +++ 
b/src/test/java/com/alibaba/easyexcel/test/temp/large/TempLargeDataTest.java @@ -0,0 +1,28 @@ +package com.alibaba.easyexcel.test.temp.large; + +import java.io.FileInputStream; + +import org.junit.Ignore; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.alibaba.easyexcel.test.core.large.LargeDataTest; +import com.alibaba.excel.EasyExcel; + +/** + * + * @author Jiaju Zhuang + */ +@Ignore +public class TempLargeDataTest { + private static final Logger LOGGER = LoggerFactory.getLogger(LargeDataTest.class); + + @Test + public void read() throws Exception { + long start = System.currentTimeMillis(); + EasyExcel.read(new FileInputStream("D:\\test\\MRP生产视图(1).xlsx"), LargeData.class, new LargeDataListener()) + .headRowNumber(2).sheet().doRead(); + LOGGER.info("Large data total time spent:{}", System.currentTimeMillis() - start); + } +} diff --git a/src/test/java/com/alibaba/easyexcel/test/temp/simple/HgTest.java b/src/test/java/com/alibaba/easyexcel/test/temp/simple/HgTest.java index eed6f92e..9b0d5402 100644 --- a/src/test/java/com/alibaba/easyexcel/test/temp/simple/HgTest.java +++ b/src/test/java/com/alibaba/easyexcel/test/temp/simple/HgTest.java @@ -24,7 +24,7 @@ public class HgTest { @Test public void hh() throws IOException { List list = - EasyExcel.read(new FileInputStream("D:\\test\\1.xls")).headRowNumber(0).sheet().doReadSync(); + EasyExcel.read(new FileInputStream("D:\\test\\testempty.xls")).headRowNumber(0).sheet().doReadSync(); for (Object data : list) { LOGGER.info("返回数据:{}", JSON.toJSONString(data)); } @@ -32,8 +32,8 @@ public class HgTest { @Test public void hh2() throws IOException { - EasyExcel.read(new FileInputStream("D:\\test\\商户不匹配工单信息收集表格.xlsx")) - .registerReadListener(new HgListener()).headRowNumber(0).sheet().doRead(); + EasyExcel.read(new FileInputStream("D:\\test\\商户不匹配工单信息收集表格.xlsx")).registerReadListener(new HgListener()) + .headRowNumber(0).sheet().doRead(); } } diff --git a/update.md b/update.md 
index 9d91b9f1..769f0955 100644 --- a/update.md +++ b/update.md @@ -1,3 +1,9 @@ +# 2.0.5 +* 优化07版超大文件读取方案 +* 支持自己设置超大文件读取参数 +* 修复读取xlsx会改变修改时间的bug [Issue #574](https://github.com/alibaba/easyexcel/issues/574) +* 默认读取忽略空行,可通过ignoreEmptyRow参数设置 + # 2.0.4 * 修复07版整个excel仅存在数字时会出现的NPE * 修复03版 用String接收电话会出现科学计数法的问题