From 1a0ae090471113b6297337f48575d7b94b7edba9 Mon Sep 17 00:00:00 2001 From: Looly Date: Wed, 31 Jul 2024 01:14:39 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8DCsvParser=E4=B8=AD=E5=AF=B9?= =?UTF-8?q?=E6=AD=A3=E6=96=87=E4=B8=AD=E5=8F=8C=E5=BC=95=E5=8F=B7=E5=A4=84?= =?UTF-8?q?=E7=90=86=E9=80=BB=E8=BE=91=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hutool/core/text/CharSequenceUtil.java | 2 +- .../org/dromara/hutool/poi/csv/CsvParser.java | 56 +++++++++++++------ .../dromara/hutool/poi/csv/Pr1244Test.java | 21 ++++++- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/hutool-core/src/main/java/org/dromara/hutool/core/text/CharSequenceUtil.java b/hutool-core/src/main/java/org/dromara/hutool/core/text/CharSequenceUtil.java index 7b570ff67..01f2868e7 100644 --- a/hutool-core/src/main/java/org/dromara/hutool/core/text/CharSequenceUtil.java +++ b/hutool-core/src/main/java/org/dromara/hutool/core/text/CharSequenceUtil.java @@ -2645,7 +2645,7 @@ public class CharSequenceUtil extends StrValidator { if (isEmpty(str)) { return toStringOrNull(str); } - if (str.charAt(0) == prefix && str.charAt(str.length() - 1) == suffix) { + if (isWrap(str, prefix, suffix)) { return sub(str, 1, str.length() - 1); } return str.toString(); diff --git a/hutool-poi/src/main/java/org/dromara/hutool/poi/csv/CsvParser.java b/hutool-poi/src/main/java/org/dromara/hutool/poi/csv/CsvParser.java index db8327a0f..a8538e7e6 100644 --- a/hutool-poi/src/main/java/org/dromara/hutool/poi/csv/CsvParser.java +++ b/hutool-poi/src/main/java/org/dromara/hutool/poi/csv/CsvParser.java @@ -92,8 +92,8 @@ public final class CsvParser extends ComputeIter implements Closeable, S /** * CSV解析器 * - * @param reader Reader - * @param config 配置,null则为默认配置 + * @param reader Reader + * @param config 配置,null则为默认配置 * @param bufferSize 默认缓存大小 */ public CsvParser(final Reader reader, final CsvReadConfig config, final int bufferSize) { @@ -109,7 
+109,7 @@ public final class CsvParser extends ComputeIter implements Closeable, S * @throws IllegalStateException 如果不解析头部或者没有调用nextRow()方法 */ public List getHeader() { - if (config.headerLineNo < 0) { + if (config.headerLineNo < 0) { throw new IllegalStateException("No header available - header parsing is disabled"); } if (lineNo < config.beginLineNo) { @@ -141,11 +141,11 @@ public final class CsvParser extends ComputeIter implements Closeable, S } // 读取范围校验 - if(lineNo < config.beginLineNo){ + if (lineNo < config.beginLineNo) { // 未达到读取起始行,继续 continue; } - if(lineNo > config.endLineNo){ + if (lineNo > config.endLineNo) { // 超出结束行,读取结束 break; } @@ -209,7 +209,7 @@ public final class CsvParser extends ComputeIter implements Closeable, S * 空行是size为1的List,唯一元素是"" * *

<p> - * 行号要考虑注释行和引号包装的内容中的换行 + * 行号要考虑注释行和引号包装的内容中的换行 * </p>

* * @return 一行数据 @@ -218,7 +218,7 @@ public final class CsvParser extends ComputeIter implements Closeable, S private List readLine() throws IORuntimeException { // 矫正行号 // 当一行内容包含多行数据时,记录首行行号,但是读取下一行时,需要把多行内容的行数加上 - if(inQuotesLineCount > 0){ + if (inQuotesLineCount > 0) { this.lineNo += this.inQuotesLineCount; this.inQuotesLineCount = 0; } @@ -257,16 +257,16 @@ public final class CsvParser extends ComputeIter implements Closeable, S final char c = buf.get(); // 注释行标记 - if(preChar < 0 || preChar == CharUtil.CR || preChar == CharUtil.LF){ + if (preChar < 0 || preChar == CharUtil.CR || preChar == CharUtil.LF) { // 判断行首字符为指定注释字符的注释开始,直到遇到换行符 // 行首分两种,1是preChar < 0表示文本开始,2是换行符后紧跟就是下一行的开始 // issue#IA8WE0 如果注释符出现在包装符内,被认为是普通字符 - if(!inQuotes && null != this.config.commentCharacter && c == this.config.commentCharacter){ + if (!inQuotes && null != this.config.commentCharacter && c == this.config.commentCharacter) { inComment = true; } } // 注释行处理 - if(inComment){ + if (inComment) { if (c == CharUtil.CR || c == CharUtil.LF) { // 注释行以换行符为结尾 lineNo++; @@ -302,8 +302,8 @@ public final class CsvParser extends ComputeIter implements Closeable, S buf.mark(); addField(currentFields, currentField.toString()); currentField.setLength(0); - } else if (c == config.textDelimiter) { - // 引号开始 + } else if (c == config.textDelimiter && isFieldBegin(preChar)) { + // 引号开始且出现在字段开头 inQuotes = true; copyLen++; } else if (c == CharUtil.CR) { @@ -361,11 +361,15 @@ public final class CsvParser extends ComputeIter implements Closeable, S final char textDelimiter = this.config.textDelimiter; // 忽略多余引号后的换行符 - field = StrUtil.trim(field, StrTrimer.TrimMode.SUFFIX, (c-> c == CharUtil.LF || c == CharUtil.CR)); + field = StrUtil.trim(field, StrTrimer.TrimMode.SUFFIX, (c -> c == CharUtil.LF || c == CharUtil.CR)); - field = StrUtil.unWrap(field, textDelimiter); - field = StrUtil.replace(field, String.valueOf(textDelimiter) + textDelimiter, String.valueOf(textDelimiter)); - if(this.config.trimField){ + 
if(StrUtil.isWrap(field, textDelimiter)){ + field = StrUtil.sub(field, 1, field.length() - 1); + // https://datatracker.ietf.org/doc/html/rfc4180#section-2 + // 第七条规则,只有包装内的包装符需要转义 + field = StrUtil.replace(field, String.valueOf(textDelimiter) + textDelimiter, String.valueOf(textDelimiter)); + } + if (this.config.trimField) { // issue#I49M0C@Gitee field = StrUtil.trim(field); } @@ -384,12 +388,30 @@ public final class CsvParser extends ComputeIter implements Closeable, S return (c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR; } + /** + * 通过前一个字符,判断是否字段开始,几种情况: + *
<ul>
+ *     <li>正文开头,无前字符</li>
+ *     <li>字段分隔符,即上个字段结束</li>
+ *     <li>换行符,即新行开始</li>
+ * </ul>
+ * + * @param preChar 前字符 + * @return 是否字段开始 + */ + private boolean isFieldBegin(final int preChar) { + return preChar == -1 + || preChar == config.fieldSeparator + || preChar == CharUtil.LF + || preChar == CharUtil.CR; + } + /** * 内部Buffer * * @author looly */ - private static class Buffer implements Serializable{ + private static class Buffer implements Serializable { private static final long serialVersionUID = 1L; final char[] buf; diff --git a/hutool-poi/src/test/java/org/dromara/hutool/poi/csv/Pr1244Test.java b/hutool-poi/src/test/java/org/dromara/hutool/poi/csv/Pr1244Test.java index c1ee2750e..dc04e92c1 100644 --- a/hutool-poi/src/test/java/org/dromara/hutool/poi/csv/Pr1244Test.java +++ b/hutool-poi/src/test/java/org/dromara/hutool/poi/csv/Pr1244Test.java @@ -8,9 +8,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals; /** * 按照 https://datatracker.ietf.org/doc/html/rfc4180#section-2
- * 如果字段正文中出现双引号,需要使用两个双引号表示转义 + * 如果字段正文中出现双引号,需要使用两个双引号表示转义,并整段使用引号包裹 */ public class Pr1244Test { + + /** + * 此测试中没有引号包裹,则所有引号都被当作内容 + */ @Test void csvReadTest() { final String csv = "a,q\"\"e,d,f"; @@ -18,6 +22,21 @@ public class Pr1244Test { final CsvData read = reader.read(); assertEquals(4, read.getRow(0).size()); assertEquals("a", read.getRow(0).get(0)); + assertEquals("q\"\"e", read.getRow(0).get(1)); + assertEquals("d", read.getRow(0).get(2)); + assertEquals("f", read.getRow(0).get(3)); + } + + /** + * 此测试中没有引号包裹,则所有引号都被当作内容 + */ + @Test + void csvReadTest2() { + final String csv = "a,q\"e,d,f"; + final CsvReader reader = CsvUtil.getReader(new StringReader(csv)); + final CsvData read = reader.read(); + assertEquals(4, read.getRow(0).size()); + assertEquals("a", read.getRow(0).get(0)); assertEquals("q\"e", read.getRow(0).get(1)); assertEquals("d", read.getRow(0).get(2)); assertEquals("f", read.getRow(0).get(3));