add tmp CsvParser2

This commit is contained in:
Looly 2024-11-22 00:32:26 +08:00
parent d05b6fd911
commit 755aed01de
3 changed files with 453 additions and 4 deletions

View File

@ -0,0 +1,362 @@
/*
* Copyright (c) 2013-2024 Hutool Team and hutool.cn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dromara.hutool.poi.csv;
import org.dromara.hutool.core.collection.iter.ComputeIter;
import org.dromara.hutool.core.io.IORuntimeException;
import org.dromara.hutool.core.map.MapUtil;
import org.dromara.hutool.core.text.CharUtil;
import org.dromara.hutool.core.text.StrTrimer;
import org.dromara.hutool.core.text.StrUtil;
import org.dromara.hutool.core.util.ObjUtil;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.util.*;
/**
 * CSV row parser, modeled after FastCSV.
 * <p>
 * Characters are pulled one at a time from a {@link CsvTokener} and assembled into {@link CsvRow}s
 * according to the supplied {@link CsvReadConfig} (field separator, text delimiter, comment character,
 * header line and line range).
 * </p>
 *
 * @author Looly
 */
public final class CsvParser2 extends ComputeIter<CsvRow> implements Closeable, Serializable {
	private static final long serialVersionUID = 1L;

	private static final int DEFAULT_ROW_CAPACITY = 10;

	private final CsvReadConfig config;
	private final CsvTokener tokener;

	/**
	 * Previously read character, carried over between {@link #readLine()} calls; -1 before anything was read
	 */
	private int preChar = -1;
	/**
	 * Whether the parser is currently inside a quoted (text-delimited) section
	 */
	private boolean inQuotes;
	/**
	 * Buffer holding the field that is currently being read
	 */
	private final StringBuilder currentField = new StringBuilder(512);
	/**
	 * Header row
	 */
	private CsvRow header;
	/**
	 * Current line number
	 */
	private long lineNo = -1;
	/**
	 * Number of line breaks seen inside quotes for the row that was just read
	 */
	private long inQuotesLineCount;
	/**
	 * Field count of the first line, used to check that every line has the same number of fields
	 */
	private int firstLineFieldCount = -1;
	/**
	 * Largest field count seen so far, used to pre-size the next row list and reduce resizing
	 */
	private int maxFieldCount;
	/**
	 * Whether the end of the input has been reached
	 */
	private boolean finished;

	/**
	 * Creates a CSV parser.
	 *
	 * @param reader Reader
	 * @param config configuration, {@code null} means the default configuration
	 */
	public CsvParser2(final Reader reader, final CsvReadConfig config) {
		this.config = ObjUtil.defaultIfNull(config, CsvReadConfig::defaultConfig);
		this.tokener = new CsvTokener(reader);
	}

	/**
	 * Returns the header field list; throws if headerLineNo &lt; 0.
	 *
	 * @return header field list
	 * @throws IllegalStateException if header parsing is disabled, or the header line has not been read yet
	 *                               (nextRow() was not called)
	 */
	public List<String> getHeader() {
		if (config.headerLineNo < 0) {
			throw new IllegalStateException("No header available - header parsing is disabled");
		}
		if (lineNo < config.beginLineNo) {
			throw new IllegalStateException("No header available - call nextRow() first");
		}
		if (null == header) {
			// The begin line was reached but the header line itself was not read yet;
			// fail with the documented exception type instead of a NullPointerException.
			throw new IllegalStateException("No header available - header line has not been read yet");
		}
		return header.getRaw();
	}

	@Override
	protected CsvRow computeNext() {
		return nextRow();
	}

	/**
	 * Reads the next row of data.
	 *
	 * @return the next {@link CsvRow}, or {@code null} when there are no more rows
	 * @throws IORuntimeException on read errors, or when field counts differ and the config requests an error
	 */
	public CsvRow nextRow() throws IORuntimeException {
		List<String> currentFields;
		int fieldCount;
		while (!finished) {
			currentFields = readLine();
			fieldCount = currentFields.size();
			if (fieldCount < 1) {
				// An empty list signals the end of input
				break;
			}

			// Check the configured line range
			if (lineNo < config.beginLineNo) {
				// Start line not reached yet, keep reading
				continue;
			}
			if (lineNo > config.endLineNo) {
				// Past the end line, stop reading
				break;
			}

			// Skip empty rows
			if (config.skipEmptyRows && fieldCount == 1 && currentFields.get(0).isEmpty()) {
				// [""] represents an empty row
				continue;
			}

			// Check that every line has the same number of fields
			if (config.errorOnDifferentFieldCount) {
				if (firstLineFieldCount < 0) {
					firstLineFieldCount = fieldCount;
				} else if (fieldCount != firstLineFieldCount) {
					throw new IORuntimeException(String.format("Line %d has %d fields, but first line has %d fields", lineNo, fieldCount, firstLineFieldCount));
				}
			}

			// Remember the largest field count
			if (fieldCount > maxFieldCount) {
				maxFieldCount = fieldCount;
			}

			// Initialize the header
			if (lineNo == config.headerLineNo && null == header) {
				initHeader(currentFields);
				// The header line itself is not returned; the following line becomes the first data row
				continue;
			}

			return new CsvRow(lineNo, null == header ? null : header.headerMap, currentFields);
		}

		return null;
	}

	/**
	 * Uses the current line as the header row.
	 *
	 * @param currentFields fields of the current line
	 */
	private void initHeader(final List<String> currentFields) {
		final Map<String, Integer> localHeaderMap = new LinkedHashMap<>(currentFields.size());
		for (int i = 0; i < currentFields.size(); i++) {
			String field = currentFields.get(i);
			if (MapUtil.isNotEmpty(this.config.headerAlias)) {
				// Apply the user-defined alias, if any
				field = ObjUtil.defaultIfNull(this.config.headerAlias.get(field), field);
			}
			// Empty names are ignored; for duplicate names the first column wins
			if (StrUtil.isNotEmpty(field) && !localHeaderMap.containsKey(field)) {
				localHeaderMap.put(field, i);
			}
		}

		header = new CsvRow(this.lineNo, Collections.unmodifiableMap(localHeaderMap), Collections.unmodifiableList(currentFields));
	}

	/**
	 * Reads one line of data. Returns a list of size 0 when the input is exhausted.<br>
	 * An empty line is a list of size 1 whose only element is "".
	 *
	 * <p>
	 * Line numbering takes comment lines and line breaks inside quoted content into account.
	 * </p>
	 *
	 * @return the fields of one line
	 * @throws IORuntimeException on IO errors
	 */
	private List<String> readLine() throws IORuntimeException {
		// Correct the line number:
		// a row spanning several physical lines is reported with the number of its first line,
		// so the extra lines are added before the next row is read.
		if (inQuotesLineCount > 0) {
			this.lineNo += this.inQuotesLineCount;
			this.inQuotesLineCount = 0;
		}

		final List<String> currentFields = new ArrayList<>(maxFieldCount > 0 ? maxFieldCount : DEFAULT_ROW_CAPACITY);

		final StringBuilder currentField = this.currentField;
		int preChar = this.preChar; // previously read character
		boolean inComment = false;

		int c;
		while (true) {
			c = tokener.next();
			if (c < 0) {
				// End of input
				this.finished = true;
				// Flush what is left as the last field: either pending content (input without a
				// trailing line break) or an empty field after a trailing separator.
				// Nothing is flushed while inside a comment line.
				if (!inComment && (currentField.length() > 0 || preChar == config.fieldSeparator)) {
					if (this.inQuotes) {
						// Unclosed quoted text: append the closing delimiter so it can be unwrapped
						currentField.append(config.textDelimiter);
					}
					addField(currentFields, currentField.toString());
					currentField.setLength(0);
				}
				break;
			}

			// Detect the start of a comment line
			if (preChar < 0 || preChar == CharUtil.CR || preChar == CharUtil.LF) {
				// A comment starts when the first character of a line is the comment character and lasts until the line break.
				// Line start means either preChar < 0 (start of text) or the character right after a line break.
				// issue#IA8WE0: a comment character inside quotes is treated as an ordinary character
				if (!inQuotes && null != this.config.commentCharacter && c == this.config.commentCharacter) {
					inComment = true;
				}
			}
			// Handle a comment line
			if (inComment) {
				if (c == CharUtil.CR || c == CharUtil.LF) {
					// A comment line ends at the line break
					lineNo++;
					inComment = false;
				}
				// Skip every character of the comment line, but remember it so that the '\n'
				// of a "\r\n" terminator is recognized as part of the same line break.
				preChar = c;
				continue;
			}

			if (inQuotes) {
				// Inside quotes everything is content until the closing quote
				if (c == config.textDelimiter) {
					// End of quoted text
					inQuotes = false;
				} else {
					// Line break inside the field content
					if (isLineEnd(c, preChar)) {
						inQuotesLineCount++;
					}
				}
				// Ordinary field character (the delimiter itself is kept and removed later in addField)
				currentField.append((char) c);
			} else {
				// Outside quotes
				if (c == config.fieldSeparator) {
					// End of a field
					addField(currentFields, currentField.toString());
					currentField.setLength(0);
				} else if (c == config.textDelimiter && isFieldBegin(preChar)) {
					// Opening quote, only recognized at the beginning of a field
					inQuotes = true;
					currentField.append((char) c);
				} else if (c == CharUtil.CR) {
					// \r
					addField(currentFields, currentField.toString());
					currentField.setLength(0);
					preChar = c;
					break;
				} else if (c == CharUtil.LF) {
					// \n
					if (preChar != CharUtil.CR) {
						addField(currentFields, currentField.toString());
						currentField.setLength(0);
						preChar = c;
						break;
					}
					// The previous character was \r, so this line was already finished; just skip
				} else {
					currentField.append((char) c);
				}
			}

			preChar = c;
		}

		// restore fields
		this.preChar = preChar;

		lineNo++;
		return currentFields;
	}

	@Override
	public void close() throws IOException {
		tokener.close();
	}

	/**
	 * Adds a field to the field list, removing the wrapping delimiters and unescaping on the way.
	 *
	 * @param currentFields current field list, i.e. the row
	 * @param field         field
	 */
	private void addField(final List<String> currentFields, String field) {
		final char textDelimiter = this.config.textDelimiter;

		// Ignore line breaks trailing after a surplus quote
		field = StrUtil.trim(field, StrTrimer.TrimMode.SUFFIX, (c -> c == CharUtil.LF || c == CharUtil.CR));

		if (StrUtil.isWrap(field, textDelimiter)) {
			field = StrUtil.sub(field, 1, field.length() - 1);
			// https://datatracker.ietf.org/doc/html/rfc4180#section-2
			// Rule 7: only delimiters inside a wrapped field need unescaping
			field = StrUtil.replace(field, String.valueOf(textDelimiter) + textDelimiter, String.valueOf(textDelimiter));
		}

		if (this.config.trimField) {
			// issue#I49M0C@Gitee
			field = StrUtil.trim(field);
		}
		currentFields.add(field);
	}

	/**
	 * Whether the character is a line terminator.
	 *
	 * @param c       character
	 * @param preChar previous character
	 * @return {@code true} if {@code c} is CR or LF and the previous character is not CR
	 * @since 5.7.4
	 */
	private boolean isLineEnd(final int c, final int preChar) {
		return (c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR;
	}

	/**
	 * Decides from the previous character whether a field begins here. Cases:
	 * <ul>
	 *     <li>start of text, no previous character</li>
	 *     <li>field separator, i.e. the previous field ended</li>
	 *     <li>line break, i.e. a new line starts</li>
	 * </ul>
	 *
	 * @param preChar previous character
	 * @return whether a field begins
	 */
	private boolean isFieldBegin(final int preChar) {
		return preChar == -1
			|| preChar == config.fieldSeparator
			|| preChar == CharUtil.LF
			|| preChar == CharUtil.CR;
	}
}

View File

@ -0,0 +1,87 @@
/*
* Copyright (c) 2024 Hutool Team and hutool.cn
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dromara.hutool.poi.csv;
import org.dromara.hutool.core.io.IORuntimeException;
import org.dromara.hutool.core.io.IoUtil;
import org.dromara.hutool.core.lang.wrapper.SimpleWrapper;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
/**
 * Character-by-character reader over a wrapped {@link Reader} that supports stepping back
 * by a single character.
 */
public class CsvTokener extends SimpleWrapper<Reader> implements Closeable {

	/**
	 * Number of characters consumed from the Reader so far
	 */
	private long index;
	/**
	 * The previously returned character
	 */
	private int prev;
	/**
	 * Whether the next call to {@link #next()} should return the previous character again
	 */
	private boolean usePrev;

	/**
	 * Constructor
	 *
	 * @param reader {@link Reader}
	 */
	public CsvTokener(final Reader reader) {
		super(reader);
	}

	/**
	 * Reads the next character and records the position.
	 *
	 * @return the next character, as returned by {@link Reader#read()}
	 * @throws IORuntimeException if reading from the Reader fails
	 */
	public int next() {
		if (this.usePrev) {
			// Replay the character that was pushed back
			this.usePrev = false;
		} else {
			try {
				this.prev = this.raw.read();
			} catch (final IOException e) {
				throw new IORuntimeException(e);
			}
		}
		// Advance the position on both paths: back() decremented it, so a replayed character
		// must be counted again, otherwise a later back() is wrongly rejected.
		this.index++;
		return this.prev;
	}

	/**
	 * Steps back by one character, so that the next {@link #next()} returns the same character again.
	 *
	 * @throws IllegalStateException if back is called twice in a row, or before any character was read
	 */
	public void back() throws IllegalStateException {
		if (this.usePrev || this.index <= 0) {
			throw new IllegalStateException("Stepping back two steps is not supported");
		}
		this.index--;
		this.usePrev = true;
	}

	@Override
	public void close() throws IOException {
		IoUtil.nullSafeClose(this.raw);
	}
}

View File

@ -29,7 +29,7 @@ public class CsvParserTest {
@Test
public void parseTest1() {
final StringReader reader = StrUtil.getReader("aaa,b\"bba\",ccc");
final CsvParser parser = new CsvParser(reader, null);
final CsvParser2 parser = new CsvParser2(reader, null);
final CsvRow row = parser.nextRow();
//noinspection ConstantConditions
Assertions.assertEquals("b\"bba\"", row.getRaw().get(1));
@ -39,7 +39,7 @@ public class CsvParserTest {
@Test
public void parseTest2() {
final StringReader reader = StrUtil.getReader("aaa,\"bba\"bbb,ccc");
final CsvParser parser = new CsvParser(reader, null);
final CsvParser2 parser = new CsvParser2(reader, null);
final CsvRow row = parser.nextRow();
//noinspection ConstantConditions
Assertions.assertEquals("\"bba\"bbb", row.getRaw().get(1));
@ -49,7 +49,7 @@ public class CsvParserTest {
@Test
public void parseTest3() {
final StringReader reader = StrUtil.getReader("aaa,\"bba\",ccc");
final CsvParser parser = new CsvParser(reader, null);
final CsvParser2 parser = new CsvParser2(reader, null);
final CsvRow row = parser.nextRow();
//noinspection ConstantConditions
Assertions.assertEquals("bba", row.getRaw().get(1));
@ -59,7 +59,7 @@ public class CsvParserTest {
@Test
public void parseTest4() {
final StringReader reader = StrUtil.getReader("aaa,\"\",ccc");
final CsvParser parser = new CsvParser(reader, null);
final CsvParser2 parser = new CsvParser2(reader, null);
final CsvRow row = parser.nextRow();
//noinspection ConstantConditions
Assertions.assertEquals("", row.getRaw().get(1));