diff --git a/hutool-poi/src/main/java/org/dromara/hutool/poi/csv/CsvParser2.java b/hutool-poi/src/main/java/org/dromara/hutool/poi/csv/CsvParser2.java new file mode 100644 index 000000000..fc9ab4f72 --- /dev/null +++ b/hutool-poi/src/main/java/org/dromara/hutool/poi/csv/CsvParser2.java @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2013-2024 Hutool Team and hutool.cn + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dromara.hutool.poi.csv; + +import org.dromara.hutool.core.collection.iter.ComputeIter; +import org.dromara.hutool.core.io.IORuntimeException; +import org.dromara.hutool.core.map.MapUtil; +import org.dromara.hutool.core.text.CharUtil; +import org.dromara.hutool.core.text.StrTrimer; +import org.dromara.hutool.core.text.StrUtil; +import org.dromara.hutool.core.util.ObjUtil; + +import java.io.Closeable; +import java.io.IOException; +import java.io.Reader; +import java.io.Serializable; +import java.util.*; + +/** + * CSV行解析器,参考:FastCSV + * + * @author Looly + */ +public final class CsvParser2 extends ComputeIter implements Closeable, Serializable { + private static final long serialVersionUID = 1L; + + private static final int DEFAULT_ROW_CAPACITY = 10; + + private final CsvReadConfig config; + private final CsvTokener tokener; + /** + * 前一个特殊分界字符 + */ + private int preChar = -1; + /** + * 是否在引号包装内 + */ + private boolean inQuotes; + /** + * 当前读取字段 + */ + private final StringBuilder currentField = new StringBuilder(512); + + /** + * 标题行 + */ + private CsvRow header; + /** + * 当前行号 + */ + private long lineNo = -1; + /** + * 引号内的行数 + */ + private long inQuotesLineCount; + /** + * 第一行字段数,用于检查每行字段数是否一致 + */ + private int firstLineFieldCount = -1; + /** + * 最大字段数量,用于初始化行,减少扩容 + */ + private int maxFieldCount; + /** + * 是否读取结束 + */ + private boolean finished; + + /** + * CSV解析器 + * + * @param reader Reader + * @param config 配置,null则为默认配置 + */ + public CsvParser2(final Reader reader, final CsvReadConfig config) { + this.config = ObjUtil.defaultIfNull(config, CsvReadConfig::defaultConfig); + this.tokener = new CsvTokener(reader); + } + + /** + * 获取头部字段列表,如果headerLineNo < 0,抛出异常 + * + * @return 头部列表 + * @throws IllegalStateException 如果不解析头部或者没有调用nextRow()方法 + */ + public List getHeader() { + if (config.headerLineNo < 0) { + throw new IllegalStateException("No header available - header parsing is disabled"); + } + if (lineNo < config.beginLineNo) { + throw new IllegalStateException("No header available - call nextRow() first"); + } + return header.getRaw(); + } + + @Override + protected CsvRow computeNext() { + return nextRow(); + } + + /** + * 读取下一行数据 + * + * @return CsvRow + * @throws IORuntimeException IO读取异常 + */ + public CsvRow nextRow() throws IORuntimeException { + List currentFields; + int fieldCount; + while (!finished) { + currentFields = readLine(); + fieldCount = currentFields.size(); + if (fieldCount < 1) { + // 空List表示读取结束 + break; + } + + // 读取范围校验 + if (lineNo < config.beginLineNo) { + // 未达到读取起始行,继续 + continue; + } + if (lineNo > config.endLineNo) { + // 超出结束行,读取结束 + break; + } + + // 跳过空行 + if (config.skipEmptyRows && fieldCount == 1 && currentFields.get(0).isEmpty()) { + // [""]表示空行 + continue; + } + + // 检查每行的字段数是否一致 + if (config.errorOnDifferentFieldCount) { + if (firstLineFieldCount < 0) { + firstLineFieldCount = fieldCount; + } else if (fieldCount != firstLineFieldCount) { + throw new IORuntimeException(String.format("Line %d has %d fields, but first line has %d fields", lineNo, fieldCount, firstLineFieldCount)); + } + } + + // 记录最大字段数 + if (fieldCount > maxFieldCount) { + maxFieldCount = fieldCount; + } + + //初始化标题 + if (lineNo == config.headerLineNo && null == header) { + initHeader(currentFields); + // 作为标题行后,此行跳过,下一行做为第一行 + continue; + } + + return new CsvRow(lineNo, null == header ? null : header.headerMap, currentFields); + } + + return null; + } + + /** + * 当前行做为标题行 + * + * @param currentFields 当前行字段列表 + */ + private void initHeader(final List currentFields) { + final Map localHeaderMap = new LinkedHashMap<>(currentFields.size()); + for (int i = 0; i < currentFields.size(); i++) { + String field = currentFields.get(i); + if (MapUtil.isNotEmpty(this.config.headerAlias)) { + // 自定义别名 + field = ObjUtil.defaultIfNull(this.config.headerAlias.get(field), field); + } + if (StrUtil.isNotEmpty(field) && !localHeaderMap.containsKey(field)) { + localHeaderMap.put(field, i); + } + } + + header = new CsvRow(this.lineNo, Collections.unmodifiableMap(localHeaderMap), Collections.unmodifiableList(currentFields)); + } + + /** + * 读取一行数据,如果读取结束,返回size为0的List
+ * 空行是size为1的List,唯一元素是"" + * + *

+ * 行号要考虑注释行和引号包装的内容中的换行 + *

+ * + * @return 一行数据 + * @throws IORuntimeException IO异常 + */ + private List readLine() throws IORuntimeException { + // 矫正行号 + // 当一行内容包含多行数据时,记录首行行号,但是读取下一行时,需要把多行内容的行数加上 + if (inQuotesLineCount > 0) { + this.lineNo += this.inQuotesLineCount; + this.inQuotesLineCount = 0; + } + + final List currentFields = new ArrayList<>(maxFieldCount > 0 ? maxFieldCount : DEFAULT_ROW_CAPACITY); + + final StringBuilder currentField = this.currentField; + int preChar = this.preChar;//前一个特殊分界字符 + boolean inComment = false; + + int c; + while (true) { + c = tokener.next(); + if(c < 0){ + // 读取结束 + this.finished = true; + break; + } + + // 注释行标记 + if (preChar < 0 || preChar == CharUtil.CR || preChar == CharUtil.LF) { + // 判断行首字符为指定注释字符的注释开始,直到遇到换行符 + // 行首分两种,1是preChar < 0表示文本开始,2是换行符后紧跟就是下一行的开始 + // issue#IA8WE0 如果注释符出现在包装符内,被认为是普通字符 + if (!inQuotes && null != this.config.commentCharacter && c == this.config.commentCharacter) { + inComment = true; + } + } + // 注释行处理 + if (inComment) { + if (c == CharUtil.CR || c == CharUtil.LF) { + // 注释行以换行符为结尾 + lineNo++; + inComment = false; + } + // 跳过注释行中的任何字符 + continue; + } + + if (inQuotes) { + //引号内,作为内容,直到引号结束 + if (c == config.textDelimiter) { + // End of quoted text + inQuotes = false; + } else { + // 字段内容中新行 + if (isLineEnd(c, preChar)) { + inQuotesLineCount++; + } + } + // 普通字段字符 + currentField.append((char)c); + } else { + // 非引号内 + if (c == config.fieldSeparator) { + //一个字段结束 + addField(currentFields, currentField.toString()); + currentField.setLength(0); + } else if (c == config.textDelimiter && isFieldBegin(preChar)) { + // 引号开始且出现在字段开头 + inQuotes = true; + currentField.append((char)c); + } else if (c == CharUtil.CR) { + // \r + addField(currentFields, currentField.toString()); + currentField.setLength(0); + preChar = c; + break; + } else if (c == CharUtil.LF) { + // \n + if (preChar != CharUtil.CR) { + addField(currentFields, currentField.toString()); + currentField.setLength(0); + preChar = c; + break; + } + // 前一个字符是\r,已经处理过这个字段了,此处直接跳过 + } else { + currentField.append((char)c); + } + } + + preChar = c; + } + + // restore fields + this.preChar = preChar; + + lineNo++; + return currentFields; + } + + @Override + public void close() throws IOException { + tokener.close(); + } + + /** + * 将字段加入字段列表并自动去包装和去转义 + * + * @param currentFields 当前的字段列表(即为行) + * @param field 字段 + */ + private void addField(final List currentFields, String field) { + final char textDelimiter = this.config.textDelimiter; + + // 忽略多余引号后的换行符 + field = StrUtil.trim(field, StrTrimer.TrimMode.SUFFIX, (c -> c == CharUtil.LF || c == CharUtil.CR)); + + if(StrUtil.isWrap(field, textDelimiter)){ + field = StrUtil.sub(field, 1, field.length() - 1); + // https://datatracker.ietf.org/doc/html/rfc4180#section-2 + // 第七条规则,只有包装内的包装符需要转义 + field = StrUtil.replace(field, String.valueOf(textDelimiter) + textDelimiter, String.valueOf(textDelimiter)); + } + if (this.config.trimField) { + // issue#I49M0C@Gitee + field = StrUtil.trim(field); + } + currentFields.add(field); + } + + /** + * 是否行结束符 + * + * @param c 符号 + * @param preChar 前一个字符 + * @return 是否结束 + * @since 5.7.4 + */ + private boolean isLineEnd(final int c, final int preChar) { + return (c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR; + } + + /** + * 通过前一个字符,判断是否字段开始,几种情况: + *
    + *
  • 正文开头,无前字符
  • + *
  • 字段分隔符,即上个字段结束
  • + *
  • 换行符,即新行开始
  • + *
+ * + * @param preChar 前字符 + * @return 是否字段开始 + */ + private boolean isFieldBegin(final int preChar) { + return preChar == -1 + || preChar == config.fieldSeparator + || preChar == CharUtil.LF + || preChar == CharUtil.CR; + } +} diff --git a/hutool-poi/src/main/java/org/dromara/hutool/poi/csv/CsvTokener.java b/hutool-poi/src/main/java/org/dromara/hutool/poi/csv/CsvTokener.java new file mode 100644 index 000000000..868a75b29 --- /dev/null +++ b/hutool-poi/src/main/java/org/dromara/hutool/poi/csv/CsvTokener.java @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024 Hutool Team and hutool.cn + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.dromara.hutool.poi.csv; + +import org.dromara.hutool.core.io.IORuntimeException; +import org.dromara.hutool.core.io.IoUtil; +import org.dromara.hutool.core.lang.wrapper.SimpleWrapper; + +import java.io.Closeable; +import java.io.IOException; +import java.io.Reader; + +public class CsvTokener extends SimpleWrapper implements Closeable { + + /** + * 在Reader的位置(解析到第几个字符) + */ + private long index; + /** + * 前一个字符 + */ + private int prev; + /** + * 是否使用前一个字符 + */ + private boolean usePrev; + + /** + * 构造 + * + * @param reader {@link Reader} + */ + public CsvTokener(final Reader reader) { + super(reader); + } + + /** + * 读取下一个字符,并记录位置 + * + * @return 下一个字符 + */ + public int next() { + if(this.usePrev){ + this.usePrev = false; + return this.prev; + } + try { + this.prev = this.raw.read(); + } catch (final IOException e) { + throw new IORuntimeException(e); + } + this.index++; + return this.prev; + } + + /** + * 将标记回退到第一个字符 + * + * @throws IllegalStateException 当多次调用back时,抛出此异常 + */ + public void back() throws IllegalStateException { + if (this.usePrev || this.index <= 0) { + throw new IllegalStateException("Stepping back two steps is not supported"); + } + this.index --; + this.usePrev = true; + } + + @Override + public void close() throws IOException { + IoUtil.nullSafeClose(this.raw); + } +} diff --git a/hutool-poi/src/test/java/org/dromara/hutool/poi/csv/CsvParserTest.java b/hutool-poi/src/test/java/org/dromara/hutool/poi/csv/CsvParserTest.java index b9ff12371..83bab13fb 100644 --- a/hutool-poi/src/test/java/org/dromara/hutool/poi/csv/CsvParserTest.java +++ b/hutool-poi/src/test/java/org/dromara/hutool/poi/csv/CsvParserTest.java @@ -29,7 +29,7 @@ public class CsvParserTest { @Test public void parseTest1() { final StringReader reader = StrUtil.getReader("aaa,b\"bba\",ccc"); - final CsvParser parser = new CsvParser(reader, null); + final CsvParser2 parser = new CsvParser2(reader, null); final CsvRow row = parser.nextRow(); //noinspection ConstantConditions Assertions.assertEquals("b\"bba\"", row.getRaw().get(1)); @@ -39,7 +39,7 @@ public class CsvParserTest { @Test public void parseTest2() { final StringReader reader = StrUtil.getReader("aaa,\"bba\"bbb,ccc"); - final CsvParser parser = new CsvParser(reader, null); + final CsvParser2 parser = new CsvParser2(reader, null); final CsvRow row = parser.nextRow(); //noinspection ConstantConditions Assertions.assertEquals("\"bba\"bbb", row.getRaw().get(1)); @@ -49,7 +49,7 @@ public class CsvParserTest { @Test public void parseTest3() { final StringReader reader = StrUtil.getReader("aaa,\"bba\",ccc"); - final CsvParser parser = new CsvParser(reader, null); + final CsvParser2 parser = new CsvParser2(reader, null); final CsvRow row = parser.nextRow(); //noinspection ConstantConditions Assertions.assertEquals("bba", row.getRaw().get(1)); @@ -59,7 +59,7 @@ public class CsvParserTest { @Test public void parseTest4() { final StringReader reader = StrUtil.getReader("aaa,\"\",ccc"); - final CsvParser parser = new CsvParser(reader, null); + final CsvParser2 parser = new CsvParser2(reader, null); final CsvRow row = parser.nextRow(); //noinspection ConstantConditions Assertions.assertEquals("", row.getRaw().get(1));