From f3238ae1e70d4f1323f47526a771516038da5404 Mon Sep 17 00:00:00 2001 From: Looly Date: Mon, 4 Jan 2021 05:36:20 +0800 Subject: [PATCH] fix CsvParser --- .../java/cn/hutool/core/io/BufferUtil.java | 85 ++++---- .../cn/hutool/core/text/csv/CsvParser.java | 186 +++++++++++++----- .../cn/hutool/core/text/csv/CsvUtilTest.java | 6 + hutool-core/src/test/resources/test.csv | 1 + 4 files changed, 190 insertions(+), 88 deletions(-) diff --git a/hutool-core/src/main/java/cn/hutool/core/io/BufferUtil.java b/hutool-core/src/main/java/cn/hutool/core/io/BufferUtil.java index fb15159da..007e0194e 100644 --- a/hutool-core/src/main/java/cn/hutool/core/io/BufferUtil.java +++ b/hutool-core/src/main/java/cn/hutool/core/io/BufferUtil.java @@ -1,28 +1,28 @@ package cn.hutool.core.io; -import java.nio.ByteBuffer; -import java.nio.charset.Charset; - import cn.hutool.core.util.CharsetUtil; import cn.hutool.core.util.StrUtil; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; + /** * {@link ByteBuffer} 工具类
* 此工具来自于 t-io 项目以及其它项目的相关部分收集
* ByteBuffer的相关介绍见:https://www.cnblogs.com/ruber/p/6857159.html - * + * * @author tanyaowu, looly * @since 4.0.0 - * */ public class BufferUtil { /** * 拷贝到一个新的ByteBuffer - * - * @param src 源ByteBuffer + * + * @param src 源ByteBuffer * @param start 起始位置(包括) - * @param end 结束位置(不包括) + * @param end 结束位置(不包括) * @return 新的ByteBuffer */ public static ByteBuffer copy(ByteBuffer src, int start, int end) { @@ -31,8 +31,8 @@ public class BufferUtil { /** * 拷贝ByteBuffer - * - * @param src 源ByteBuffer + * + * @param src 源ByteBuffer * @param dest 目标ByteBuffer * @return 目标ByteBuffer */ @@ -42,9 +42,9 @@ public class BufferUtil { /** * 拷贝ByteBuffer - * - * @param src 源ByteBuffer - * @param dest 目标ByteBuffer + * + * @param src 源ByteBuffer + * @param dest 目标ByteBuffer * @param length 长度 * @return 目标ByteBuffer */ @@ -54,12 +54,12 @@ public class BufferUtil { /** * 拷贝ByteBuffer - * - * @param src 源ByteBuffer - * @param srcStart 源开始的位置 - * @param dest 目标ByteBuffer + * + * @param src 源ByteBuffer + * @param srcStart 源开始的位置 + * @param dest 目标ByteBuffer * @param destStart 目标开始的位置 - * @param length 长度 + * @param length 长度 * @return 目标ByteBuffer */ public static ByteBuffer copy(ByteBuffer src, int srcStart, ByteBuffer dest, int destStart, int length) { @@ -69,7 +69,7 @@ public class BufferUtil { /** * 读取剩余部分并转为UTF-8编码字符串 - * + * * @param buffer ByteBuffer * @return 字符串 * @since 4.5.0 @@ -80,8 +80,8 @@ public class BufferUtil { /** * 读取剩余部分并转为字符串 - * - * @param buffer ByteBuffer + * + * @param buffer ByteBuffer * @param charset 编码 * @return 字符串 * @since 4.5.0 @@ -92,7 +92,7 @@ public class BufferUtil { /** * 读取剩余部分bytes
- * + * * @param buffer ByteBuffer * @return bytes */ @@ -106,8 +106,8 @@ public class BufferUtil { /** * 读取指定长度的bytes
* 如果长度不足,则读取剩余部分,此时buffer必须为读模式 - * - * @param buffer ByteBuffer + * + * @param buffer ByteBuffer * @param maxLength 最大长度 * @return bytes */ @@ -123,10 +123,10 @@ public class BufferUtil { /** * 读取指定区间的数据 - * + * * @param buffer {@link ByteBuffer} - * @param start 开始位置 - * @param end 结束位置 + * @param start 开始位置 + * @param end 结束位置 * @return bytes */ public static byte[] readBytes(ByteBuffer buffer, int start, int end) { @@ -148,13 +148,13 @@ public class BufferUtil { /** * 一行的末尾位置,查找位置时位移ByteBuffer到结束位置
* 支持的换行符如下: - * + * *
 	 * 1. \r\n
 	 * 2. \n
 	 * 
* - * @param buffer {@link ByteBuffer} + * @param buffer {@link ByteBuffer} * @param maxLength 读取最大长度 * @return 末尾位置,未找到或达到最大长度返回-1 */ @@ -191,13 +191,13 @@ public class BufferUtil { /** * 读取一行,如果buffer中最后一部分并非完整一行,则返回null
* 支持的换行符如下: - * + * *
 	 * 1. \r\n
 	 * 2. \n
 	 * 
- * - * @param buffer ByteBuffer + * + * @param buffer ByteBuffer * @param charset 编码 * @return 一行 */ @@ -217,7 +217,7 @@ public class BufferUtil { /** * 创建新Buffer - * + * * @param data 数据 * @return {@link ByteBuffer} * @since 4.5.0 @@ -228,8 +228,8 @@ public class BufferUtil { /** * 从字符串创建新Buffer - * - * @param data 数据 + * + * @param data 数据 * @param charset 编码 * @return {@link ByteBuffer} * @since 4.5.0 @@ -237,10 +237,10 @@ public class BufferUtil { public static ByteBuffer create(CharSequence data, Charset charset) { return create(StrUtil.bytes(data, charset)); } - + /** * 从字符串创建新Buffer,使用UTF-8编码 - * + * * @param data 数据 * @return {@link ByteBuffer} * @since 4.5.0 @@ -248,4 +248,15 @@ public class BufferUtil { public static ByteBuffer createUtf8(CharSequence data) { return create(StrUtil.utf8Bytes(data)); } + + /** + * 创建{@link CharBuffer} + * + * @param capacity 容量 + * @return {@link CharBuffer} + * @since 5.5.7 + */ + public static CharBuffer createCharBuffer(int capacity) { + return CharBuffer.allocate(capacity); + } } diff --git a/hutool-core/src/main/java/cn/hutool/core/text/csv/CsvParser.java b/hutool-core/src/main/java/cn/hutool/core/text/csv/CsvParser.java index 0a362ed93..440556216 100644 --- a/hutool-core/src/main/java/cn/hutool/core/text/csv/CsvParser.java +++ b/hutool-core/src/main/java/cn/hutool/core/text/csv/CsvParser.java @@ -31,19 +31,7 @@ public final class CsvParser implements Closeable, Serializable { private final Reader reader; private final CsvReadConfig config; - private final char[] buf = new char[IoUtil.DEFAULT_LARGE_BUFFER_SIZE]; - /** - * 当前位置 - */ - private int bufPos; - /** - * 读取一段后数据长度 - */ - private int bufLen; - /** - * 拷贝开始的位置,一般为上一行的结束位置 - */ - private int copyStart; + private final Buffer buf = new Buffer(IoUtil.DEFAULT_LARGE_BUFFER_SIZE); /** * 前一个特殊分界字符 */ @@ -70,7 +58,7 @@ public final class CsvParser implements Closeable, Serializable { */ private int firstLineFieldCount = -1; /** - * 最大字段数量 + * 最大字段数量,用于初始化行,减少扩容 */ private int maxFieldCount; /** @@ -181,42 +169,55 @@ public final class CsvParser implements Closeable, Serializable { private List readLine() throws IORuntimeException { final List currentFields = new ArrayList<>(maxFieldCount > 0 ? maxFieldCount : DEFAULT_ROW_CAPACITY); - final StrBuilder localCurrentField = currentField; - final char[] localBuf = this.buf; - int localBufPos = bufPos;//当前位置 - int localPreChar = preChar;//前一个特殊分界字符 - int localCopyStart = copyStart;//拷贝起始位置 + final StrBuilder currentField = this.currentField; + final Buffer buf = this.buf; + int preChar = this.preChar;//前一个特殊分界字符 int copyLen = 0; //拷贝长度 + boolean lineStart = true; + boolean inComment = false; while (true) { - if (bufLen == localBufPos) { + if (false == buf.hasRemaining()) { // 此Buffer读取结束,开始读取下一段 - if (copyLen > 0) { - localCurrentField.append(localBuf, localCopyStart, copyLen); + buf.appendTo(currentField, copyLen); + // 此处无需mark,read方法会重置mark } - try { - bufLen = reader.read(localBuf); - } catch (IOException e) { - throw new IORuntimeException(e); - } - - if (bufLen < 0) { + if (buf.read(this.reader) < 0) { // CSV读取结束 finished = true; - if (localPreChar == config.fieldSeparator || localCurrentField.hasContent()) { + if (currentField.hasContent() || preChar == config.fieldSeparator) { //剩余部分作为一个字段 - addField(currentFields, localCurrentField.toStringAndReset()); + addField(currentFields, currentField.toStringAndReset()); } break; } //重置 - localCopyStart = localBufPos = copyLen = 0; + copyLen = 0; } - final char c = localBuf[localBufPos++]; + final char c = buf.get(); + + // 注释行标记 + if(lineStart){ + if(c == this.config.commentCharacter){ + inComment = true; + } + lineStart = false; + } + // 注释行处理 + if(inComment){ + if ((c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR) { + // 注释行以换行符为结尾 + inComment = false; + } + // 跳过注释行中的任何字符 + buf.mark(); + preChar = c; + continue; + } if (inQuotes) { //引号内,做为内容,直到引号结束 @@ -224,21 +225,23 @@ public final class CsvParser implements Closeable, Serializable { // End of quoted text inQuotes = false; } else { - if ((c == CharUtil.CR || c == CharUtil.LF) && localPreChar != CharUtil.CR) { + // 新行 + if ((c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR) { lineNo++; } } + // 普通字段字符 copyLen++; } else { // 非引号内 if (c == config.fieldSeparator) { //一个字段结束 if (copyLen > 0) { - localCurrentField.append(localBuf, localCopyStart, copyLen); + buf.appendTo(currentField, copyLen); copyLen = 0; } - addField(currentFields, localCurrentField.toStringAndReset()); - localCopyStart = localBufPos; + buf.mark(); + addField(currentFields, currentField.toStringAndReset()); } else if (c == config.textDelimiter) { // 引号开始 inQuotes = true; @@ -246,37 +249,36 @@ public final class CsvParser implements Closeable, Serializable { } else if (c == CharUtil.CR) { // \r,直接结束 if (copyLen > 0) { - localCurrentField.append(localBuf, localCopyStart, copyLen); + buf.appendTo(currentField, copyLen); } - addField(currentFields, localCurrentField.toStringAndReset()); - localPreChar = c; - localCopyStart = localBufPos; + buf.mark(); + addField(currentFields, currentField.toStringAndReset()); + preChar = c; break; } else if (c == CharUtil.LF) { // \n - if (localPreChar != CharUtil.CR) { + if (preChar != CharUtil.CR) { if (copyLen > 0) { - localCurrentField.append(localBuf, localCopyStart, copyLen); + buf.appendTo(currentField, copyLen); } - addField(currentFields, localCurrentField.toStringAndReset()); - localPreChar = c; - localCopyStart = localBufPos; + buf.mark(); + addField(currentFields, currentField.toStringAndReset()); + preChar = c; break; } // 前一个字符是\r,已经处理过这个字段了,此处直接跳过 - localCopyStart = localBufPos; + buf.mark(); } else { + // 普通字符 copyLen++; } } - localPreChar = c; + preChar = c; } // restore fields - bufPos = localBufPos; - preChar = localPreChar; - copyStart = localCopyStart; + this.preChar = preChar; return currentFields; } @@ -298,4 +300,86 @@ public final class CsvParser implements Closeable, Serializable { field = StrUtil.replace(field, "" + textDelimiter + textDelimiter, textDelimiter + ""); currentFields.add(StrUtil.unWrap(field, textDelimiter)); } + + /** + * 内部Buffer + * + * @author looly + */ + private static class Buffer { + final char[] buf; + + /** + * 标记位置,用于读数据 + */ + private int mark; + /** + * 当前位置 + */ + private int position; + /** + * 读取的数据长度,一般小于buf.length,-1表示无数据 + */ + private int limit; + + Buffer(int capacity) { + buf = new char[capacity]; + } + + /** + * 是否还有未读数据 + * + * @return 是否还有未读数据 + */ + public final boolean hasRemaining() { + return position < limit; + } + + /** + * 读取到缓存 + * + * @param reader {@link Reader} + */ + int read(Reader reader) { + int length; + try { + length = reader.read(this.buf); + } catch (IOException e) { + throw new IORuntimeException(e); + } + this.mark = 0; + this.position = 0; + this.limit = length; + return length; + } + + /** + * 先获取当前字符,再将当前位置后移一位
+ * 此方法不检查是否到了数组末尾,请自行使用{@link #hasRemaining()}判断。 + * + * @return 当前位置字符 + * @see #hasRemaining() + */ + char get() { + return this.buf[this.position++]; + } + + /** + * 标记位置记为下次读取位置 + */ + void mark() { + this.mark = this.position; + } + + /** + * 将数据追加到{@link StrBuilder},追加结束后需手动调用{@link #mark()} 重置读取位置 + * + * @param builder {@link StrBuilder} + * @param length 追加的长度 + * @see #mark() + */ + void appendTo(StrBuilder builder, int length) { + builder.append(this.buf, this.mark, length); + } + } } diff --git a/hutool-core/src/test/java/cn/hutool/core/text/csv/CsvUtilTest.java b/hutool-core/src/test/java/cn/hutool/core/text/csv/CsvUtilTest.java index 74b81a29c..0ace0b405 100644 --- a/hutool-core/src/test/java/cn/hutool/core/text/csv/CsvUtilTest.java +++ b/hutool-core/src/test/java/cn/hutool/core/text/csv/CsvUtilTest.java @@ -41,6 +41,12 @@ public class CsvUtilTest { Assert.assertEquals("\"", csvRow.get(6)); }); } + + @Test + public void readTest3() { + CsvReader reader = CsvUtil.getReader(); + reader.read(FileUtil.getUtf8Reader("test.csv"), Console::log); + } @Test @Ignore diff --git a/hutool-core/src/test/resources/test.csv b/hutool-core/src/test/resources/test.csv index 6c6ee2ff4..9c5057d09 100644 --- a/hutool-core/src/test/resources/test.csv +++ b/hutool-core/src/test/resources/test.csv @@ -1 +1,2 @@ +# 这是一行注释,读取时应忽略 "sss,sss",姓名,"性别",关注"对象",年龄,"",""" \ No newline at end of file