fix CsvParser

This commit is contained in:
Looly 2021-01-04 05:36:20 +08:00
parent 3c2f0e46b0
commit f3238ae1e7
4 changed files with 190 additions and 88 deletions

View File

@ -1,11 +1,12 @@
package cn.hutool.core.io; package cn.hutool.core.io;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import cn.hutool.core.util.CharsetUtil; import cn.hutool.core.util.CharsetUtil;
import cn.hutool.core.util.StrUtil; import cn.hutool.core.util.StrUtil;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
/** /**
* {@link ByteBuffer} 工具类<br> * {@link ByteBuffer} 工具类<br>
* 此工具来自于 t-io 项目以及其它项目的相关部分收集<br> * 此工具来自于 t-io 项目以及其它项目的相关部分收集<br>
@ -13,7 +14,6 @@ import cn.hutool.core.util.StrUtil;
* *
* @author tanyaowu, looly * @author tanyaowu, looly
* @since 4.0.0 * @since 4.0.0
*
*/ */
public class BufferUtil { public class BufferUtil {
@ -248,4 +248,15 @@ public class BufferUtil {
public static ByteBuffer createUtf8(CharSequence data) { public static ByteBuffer createUtf8(CharSequence data) {
return create(StrUtil.utf8Bytes(data)); return create(StrUtil.utf8Bytes(data));
} }
/**
 * Creates a new {@link CharBuffer} with the requested capacity.
 *
 * @param capacity capacity of the new buffer, in chars
 * @return the newly allocated {@link CharBuffer}
 * @since 5.5.7
 */
public static CharBuffer createCharBuffer(int capacity) {
	// Plain delegation to the JDK factory; argument validation is left to it.
	final CharBuffer allocated = CharBuffer.allocate(capacity);
	return allocated;
}
} }

View File

@ -31,19 +31,7 @@ public final class CsvParser implements Closeable, Serializable {
private final Reader reader; private final Reader reader;
private final CsvReadConfig config; private final CsvReadConfig config;
private final char[] buf = new char[IoUtil.DEFAULT_LARGE_BUFFER_SIZE]; private final Buffer buf = new Buffer(IoUtil.DEFAULT_LARGE_BUFFER_SIZE);
/**
* 当前位置
*/
private int bufPos;
/**
* 读取一段后数据长度
*/
private int bufLen;
/**
* 拷贝开始的位置一般为上一行的结束位置
*/
private int copyStart;
/** /**
* 前一个特殊分界字符 * 前一个特殊分界字符
*/ */
@ -70,7 +58,7 @@ public final class CsvParser implements Closeable, Serializable {
*/ */
private int firstLineFieldCount = -1; private int firstLineFieldCount = -1;
/** /**
* 最大字段数量 * 最大字段数量用于初始化行减少扩容
*/ */
private int maxFieldCount; private int maxFieldCount;
/** /**
@ -181,42 +169,55 @@ public final class CsvParser implements Closeable, Serializable {
private List<String> readLine() throws IORuntimeException { private List<String> readLine() throws IORuntimeException {
final List<String> currentFields = new ArrayList<>(maxFieldCount > 0 ? maxFieldCount : DEFAULT_ROW_CAPACITY); final List<String> currentFields = new ArrayList<>(maxFieldCount > 0 ? maxFieldCount : DEFAULT_ROW_CAPACITY);
final StrBuilder localCurrentField = currentField; final StrBuilder currentField = this.currentField;
final char[] localBuf = this.buf; final Buffer buf = this.buf;
int localBufPos = bufPos;//当前位置 int preChar = this.preChar;//前一个特殊分界字符
int localPreChar = preChar;//前一个特殊分界字符
int localCopyStart = copyStart;//拷贝起始位置
int copyLen = 0; //拷贝长度 int copyLen = 0; //拷贝长度
boolean lineStart = true;
boolean inComment = false;
while (true) { while (true) {
if (bufLen == localBufPos) { if (false == buf.hasRemaining()) {
// 此Buffer读取结束开始读取下一段 // 此Buffer读取结束开始读取下一段
if (copyLen > 0) { if (copyLen > 0) {
localCurrentField.append(localBuf, localCopyStart, copyLen); buf.appendTo(currentField, copyLen);
// 此处无需markread方法会重置mark
} }
try { if (buf.read(this.reader) < 0) {
bufLen = reader.read(localBuf);
} catch (IOException e) {
throw new IORuntimeException(e);
}
if (bufLen < 0) {
// CSV读取结束 // CSV读取结束
finished = true; finished = true;
if (localPreChar == config.fieldSeparator || localCurrentField.hasContent()) { if (currentField.hasContent() || preChar == config.fieldSeparator) {
//剩余部分作为一个字段 //剩余部分作为一个字段
addField(currentFields, localCurrentField.toStringAndReset()); addField(currentFields, currentField.toStringAndReset());
} }
break; break;
} }
//重置 //重置
localCopyStart = localBufPos = copyLen = 0; copyLen = 0;
} }
final char c = localBuf[localBufPos++]; final char c = buf.get();
// 注释行标记
if(lineStart){
if(c == this.config.commentCharacter){
inComment = true;
}
lineStart = false;
}
// 注释行处理
if(inComment){
if ((c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR) {
// 注释行以换行符为结尾
inComment = false;
}
// 跳过注释行中的任何字符
buf.mark();
preChar = c;
continue;
}
if (inQuotes) { if (inQuotes) {
//引号内做为内容直到引号结束 //引号内做为内容直到引号结束
@ -224,21 +225,23 @@ public final class CsvParser implements Closeable, Serializable {
// End of quoted text // End of quoted text
inQuotes = false; inQuotes = false;
} else { } else {
if ((c == CharUtil.CR || c == CharUtil.LF) && localPreChar != CharUtil.CR) { // 新行
if ((c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR) {
lineNo++; lineNo++;
} }
} }
// 普通字段字符
copyLen++; copyLen++;
} else { } else {
// 非引号内 // 非引号内
if (c == config.fieldSeparator) { if (c == config.fieldSeparator) {
//一个字段结束 //一个字段结束
if (copyLen > 0) { if (copyLen > 0) {
localCurrentField.append(localBuf, localCopyStart, copyLen); buf.appendTo(currentField, copyLen);
copyLen = 0; copyLen = 0;
} }
addField(currentFields, localCurrentField.toStringAndReset()); buf.mark();
localCopyStart = localBufPos; addField(currentFields, currentField.toStringAndReset());
} else if (c == config.textDelimiter) { } else if (c == config.textDelimiter) {
// 引号开始 // 引号开始
inQuotes = true; inQuotes = true;
@ -246,37 +249,36 @@ public final class CsvParser implements Closeable, Serializable {
} else if (c == CharUtil.CR) { } else if (c == CharUtil.CR) {
// \r直接结束 // \r直接结束
if (copyLen > 0) { if (copyLen > 0) {
localCurrentField.append(localBuf, localCopyStart, copyLen); buf.appendTo(currentField, copyLen);
} }
addField(currentFields, localCurrentField.toStringAndReset()); buf.mark();
localPreChar = c; addField(currentFields, currentField.toStringAndReset());
localCopyStart = localBufPos; preChar = c;
break; break;
} else if (c == CharUtil.LF) { } else if (c == CharUtil.LF) {
// \n // \n
if (localPreChar != CharUtil.CR) { if (preChar != CharUtil.CR) {
if (copyLen > 0) { if (copyLen > 0) {
localCurrentField.append(localBuf, localCopyStart, copyLen); buf.appendTo(currentField, copyLen);
} }
addField(currentFields, localCurrentField.toStringAndReset()); buf.mark();
localPreChar = c; addField(currentFields, currentField.toStringAndReset());
localCopyStart = localBufPos; preChar = c;
break; break;
} }
// 前一个字符是\r已经处理过这个字段了此处直接跳过 // 前一个字符是\r已经处理过这个字段了此处直接跳过
localCopyStart = localBufPos; buf.mark();
} else { } else {
// 普通字符
copyLen++; copyLen++;
} }
} }
localPreChar = c; preChar = c;
} }
// restore fields // restore fields
bufPos = localBufPos; this.preChar = preChar;
preChar = localPreChar;
copyStart = localCopyStart;
return currentFields; return currentFields;
} }
@ -298,4 +300,86 @@ public final class CsvParser implements Closeable, Serializable {
field = StrUtil.replace(field, "" + textDelimiter + textDelimiter, textDelimiter + ""); field = StrUtil.replace(field, "" + textDelimiter + textDelimiter, textDelimiter + "");
currentFields.add(StrUtil.unWrap(field, textDelimiter)); currentFields.add(StrUtil.unWrap(field, textDelimiter));
} }
/**
 * Internal character buffer used while parsing.<br>
 * It wraps a fixed-size {@code char[]} together with a read position, a mark
 * (start of the pending copy range) and a limit (result of the last read).
 *
 * @author looly
 */
private static class Buffer {
	final char[] buf;

	/**
	 * Marked position; {@link #appendTo(StrBuilder, int)} copies starting from here
	 */
	private int mark;
	/**
	 * Index of the next char returned by {@link #get()}
	 */
	private int position;
	/**
	 * Value returned by the last {@link #read(Reader)} call, i.e. the number of
	 * chars read into the buffer (may be negative when the reader reported no data)
	 */
	private int limit;

	Buffer(int capacity) {
		buf = new char[capacity];
	}

	/**
	 * Tells whether unread chars are left in the buffer.
	 *
	 * @return {@code true} if the current position is before the limit
	 */
	public final boolean hasRemaining() {
		return limit > position;
	}

	/**
	 * Fills the buffer from the given reader and rewinds mark and position to 0.
	 *
	 * @param reader {@link Reader} to read from
	 * @return the value returned by {@link Reader#read(char[])}, which is also stored as the new limit
	 * @throws IORuntimeException wrapping any {@link IOException} raised by the reader
	 */
	int read(Reader reader) {
		final int count;
		try {
			count = reader.read(this.buf);
		} catch (IOException e) {
			throw new IORuntimeException(e);
		}

		// a fresh chunk was loaded: start over from the beginning of the array
		this.position = 0;
		this.mark = 0;
		this.limit = count;
		return count;
	}

	/**
	 * Returns the char at the current position, then advances the position by one.<br>
	 * No bounds check is done here; callers are expected to test
	 * {@link #hasRemaining()} first.
	 *
	 * @return the char at the current position
	 * @see #hasRemaining()
	 */
	char get() {
		// post-increment first, so the position moves on before the array access
		final int index = this.position++;
		return this.buf[index];
	}

	/**
	 * Sets the mark to the current position, making it the start of the next copy.
	 */
	void mark() {
		this.mark = this.position;
	}

	/**
	 * Appends {@code length} chars, starting at the mark, to the given {@link StrBuilder}.<br>
	 * The mark itself is not moved; call {@link #mark()} afterwards to reset the copy start.
	 *
	 * @param builder {@link StrBuilder} receiving the chars
	 * @param length  number of chars to append
	 * @see #mark()
	 */
	void appendTo(StrBuilder builder, int length) {
		final int from = this.mark;
		builder.append(this.buf, from, length);
	}
}
} }

View File

@ -42,6 +42,12 @@ public class CsvUtilTest {
}); });
} }
/**
 * Reads "test.csv" through the reader returned by {@code CsvUtil.getReader()}
 * and hands every row to {@code Console::log}.
 */
@Test
public void readTest3() {
	CsvUtil.getReader().read(FileUtil.getUtf8Reader("test.csv"), Console::log);
}
@Test @Test
@Ignore @Ignore
public void writeTest() { public void writeTest() {

View File

@ -1 +1,2 @@
# 这是一行注释,读取时应忽略
"sss,sss",姓名,"性别",关注"对象",年龄,"",""" "sss,sss",姓名,"性别",关注"对象",年龄,"","""
Can't render this file because it contains an unexpected character in line 1 and column 33.