csv support beginLineNo

This commit is contained in:
Looly 2021-07-09 22:51:35 +08:00
parent d68cc83b7d
commit 9fd7c02c86
8 changed files with 147 additions and 14 deletions

View File

@ -15,6 +15,7 @@
* 【setting】 Props增加toProperties方法issue#1701@Github
* 【http 】 UserAgent增加getOsVersion方法issue#I3YZUQ@Gitee
* 【jwt 】 JWT增加validate方法issue#I3YDM4@Gitee
* 【core 】 CsvReader支持指定读取开始行号和结束行号issue#I3ZMZL@Gitee
### 🐞Bug修复
* 【core 】 修复RadixUtil.decode非static问题issue#I3YPEH@Gitee

View File

@ -69,4 +69,12 @@ public class CsvData implements Iterable<CsvRow>, Serializable {
public Iterator<CsvRow> iterator() {
return this.rows.iterator();
}
/**
 * Renders this object as text for debugging, listing the header followed by the rows.
 *
 * @return text of the form {@code CsvData{header=..., rows=...}}
 */
@Override
public String toString() {
	final StringBuilder text = new StringBuilder("CsvData{");
	// append(Object) renders null as "null", exactly like string concatenation does
	text.append("header=").append(header);
	text.append(", rows=").append(rows);
	return text.append('}').toString();
}
}

View File

@ -52,7 +52,11 @@ public final class CsvParser implements Closeable, Serializable {
/**
* 当前行号
*/
private long lineNo;
private long lineNo = -1;
/**
* 引号内的行数
*/
private long inQuotesLineCount;
/**
* 第一行字段数用于检查每行字段数是否一致
*/
@ -87,7 +91,7 @@ public final class CsvParser implements Closeable, Serializable {
if (false == config.containsHeader) {
throw new IllegalStateException("No header available - header parsing is disabled");
}
if (lineNo == 0) {
if (lineNo < config.beginLineNo) {
throw new IllegalStateException("No header available - call nextRow() first");
}
return header.fields;
@ -100,25 +104,35 @@ public final class CsvParser implements Closeable, Serializable {
* @throws IORuntimeException IO读取异常
*/
public CsvRow nextRow() throws IORuntimeException {
long startingLineNo;
List<String> currentFields;
int fieldCount;
while (false == finished) {
startingLineNo = ++lineNo;
currentFields = readLine();
fieldCount = currentFields.size();
if (fieldCount < 1) {
// 空List表示读取结束
break;
}
// 读取范围校验
if(lineNo < config.beginLineNo){
// 未达到读取起始行继续
continue;
}
if(lineNo > config.endLineNo){
// 超出结束行读取结束
break;
}
// 跳过空行
if (config.skipEmptyRows && fieldCount == 1 && currentFields.get(0).isEmpty()) {
// [""]表示空行
continue;
}
// 检查每行的字段数是否一致
if (config.errorOnDifferentFieldCount) {
if (firstLineFieldCount == -1) {
if (firstLineFieldCount < 0) {
firstLineFieldCount = fieldCount;
} else if (fieldCount != firstLineFieldCount) {
throw new IORuntimeException(String.format("Line %d has %d fields, but first line has %d fields", lineNo, fieldCount, firstLineFieldCount));
@ -137,7 +151,7 @@ public final class CsvParser implements Closeable, Serializable {
continue;
}
return new CsvRow(startingLineNo, null == header ? null : header.headerMap, currentFields);
return new CsvRow(lineNo, null == header ? null : header.headerMap, currentFields);
}
return null;
@ -161,12 +175,24 @@ public final class CsvParser implements Closeable, Serializable {
}
/**
* 读取一行数据
* 读取一行数据如果读取结束返回size为0的List<br>
* 空行是size为1的List唯一元素是""
*
* <p>
* 行号要考虑注释行和引号包装的内容中的换行
* </p>
*
* @return 一行数据
* @throws IORuntimeException IO异常
*/
private List<String> readLine() throws IORuntimeException {
// 矫正行号
// 当一行内容包含多行数据时记录首行行号但是读取下一行时需要把多行内容的行数加上
if(inQuotesLineCount > 0){
this.lineNo += this.inQuotesLineCount;
this.inQuotesLineCount = 0;
}
final List<String> currentFields = new ArrayList<>(maxFieldCount > 0 ? maxFieldCount : DEFAULT_ROW_CAPACITY);
final StrBuilder currentField = this.currentField;
@ -211,6 +237,7 @@ public final class CsvParser implements Closeable, Serializable {
if(inComment){
if (c == CharUtil.CR || c == CharUtil.LF) {
// 注释行以换行符为结尾
lineNo++;
inComment = false;
}
// 跳过注释行中的任何字符
@ -225,9 +252,9 @@ public final class CsvParser implements Closeable, Serializable {
// End of quoted text
inQuotes = false;
} else {
// 新行
if ((c == CharUtil.CR || c == CharUtil.LF) && preChar != CharUtil.CR) {
lineNo++;
// 字段内容中新行
if (isLineEnd(c)) {
inQuotesLineCount++;
}
}
// 普通字段字符
@ -280,6 +307,7 @@ public final class CsvParser implements Closeable, Serializable {
// restore fields
this.preChar = preChar;
lineNo++;
return currentFields;
}
@ -301,12 +329,24 @@ public final class CsvParser implements Closeable, Serializable {
currentFields.add(field);
}
/**
 * Tells whether the given character should be counted as the end of a line.
 * <p>
 * CR and LF both qualify, unless the {@code preChar} field currently holds a CR;
 * presumably this keeps a CR LF pair from being counted twice.
 *
 * @param c character to test
 * @return {@code true} if {@code c} is CR or LF and {@code preChar} is not CR
 * @since 5.7.4
 */
private boolean isLineEnd(char c){
	if (preChar == CharUtil.CR) {
		// A character directly following a CR never starts a new count
		return false;
	}
	return c == CharUtil.CR || c == CharUtil.LF;
}
/**
* 内部Buffer
*
* @author looly
*/
private static class Buffer {
private static class Buffer implements Serializable{
private static final long serialVersionUID = 1L;
final char[] buf;
/**

View File

@ -17,6 +17,10 @@ public class CsvReadConfig extends CsvConfig implements Serializable {
protected boolean skipEmptyRows = true;
/** Whether to throw an exception when rows have differing numbers of fields; defaults to false. */
protected boolean errorOnDifferentFieldCount;
/** First line to read (inclusive), given as a line number of the original file; defaults to 0. */
protected long beginLineNo;
/** Last line to read (inclusive), given as a line number of the original file; the default is large enough to act as "no limit". */
protected long endLineNo = Long.MAX_VALUE-1;
/**
* 默认配置
@ -59,4 +63,28 @@ public class CsvReadConfig extends CsvConfig implements Serializable {
this.errorOnDifferentFieldCount = errorOnDifferentFieldCount;
return this;
}
/**
 * Sets the first line to read (inclusive). The value is a line number of the
 * original file; the default is 0.
 *
 * @param beginLineNo line number at which reading starts (inclusive)
 * @return this
 * @since 5.7.4
 */
public CsvReadConfig setBeginLineNo(long beginLineNo) {
this.beginLineNo = beginLineNo;
return this;
}
/**
 * Sets the last line to read (inclusive). The value is a line number of the
 * original file; by default no limit is applied.
 *
 * @param endLineNo line number at which reading stops (inclusive)
 * @return this
 * @since 5.7.4
 */
public CsvReadConfig setEndLineNo(long endLineNo) {
this.endLineNo = endLineNo;
return this;
}
}

View File

@ -1,6 +1,7 @@
package cn.hutool.core.text.csv;
import cn.hutool.core.bean.BeanUtil;
import cn.hutool.core.lang.Assert;
import java.util.Collection;
import java.util.Iterator;
@ -30,14 +31,14 @@ public final class CsvRow implements List<String> {
* @param fields 数据列表
*/
public CsvRow(final long originalLineNumber, final Map<String, Integer> headerMap, final List<String> fields) {
// Check the field list before storing anything.
// NOTE(review): presumably Assert.notNull rejects a null list with this message - confirm against cn.hutool.core.lang.Assert
Assert.notNull(fields, "fields must be not null!");
this.originalLineNumber = originalLineNumber;
// Stored as given without a null check; the parser passes null here when it has no header
this.headerMap = headerMap;
this.fields = fields;
}
/**
* 获取原始行号多行情况下为首行行号
* 获取原始行号多行情况下为首行行号忽略注释行
*
* @return the original line number 行号
*/

View File

@ -1,6 +1,7 @@
package cn.hutool.core.text.csv;
import cn.hutool.core.annotation.Alias;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.resource.ResourceUtil;
import cn.hutool.core.lang.Console;
@ -19,6 +20,7 @@ public class CsvReaderTest {
CsvReader reader = new CsvReader();
CsvData data = reader.read(ResourceUtil.getReader("test.csv", CharsetUtil.CHARSET_UTF_8));
Assert.assertEquals("sss,sss", data.getRow(0).get(0));
Assert.assertEquals(1, data.getRow(0).getOriginalLineNumber());
Assert.assertEquals("性别", data.getRow(0).get(2));
Assert.assertEquals("关注\"对象\"", data.getRow(0).get(3));
}
@ -97,4 +99,50 @@ public class CsvReaderTest {
Console.log(row.getByName("案件ID"));
}
}
/**
 * Reads test_lines.csv with the default configuration and checks the original
 * line number and content of selected rows. Line numbers are 0-based and
 * include comment lines and line breaks inside quoted fields.
 */
@Test
public void lineNoTest(){
	CsvReader reader = new CsvReader();
	CsvData data = reader.read(ResourceUtil.getReader("test_lines.csv", CharsetUtil.CHARSET_UTF_8));

	// Parallel tables: row index -> expected original line number and joined content
	final int[] rowIndexes = {0, 2, 3};
	final long[] expectedLineNumbers = {1, 4, 6};
	final String[] expectedContents = {
			"a,b,c,d",
			"q,w,e,r,我是一段\n带换行的内容",
			"a,s,d,f"
	};

	for (int i = 0; i < rowIndexes.length; i++) {
		Assert.assertEquals(expectedLineNumbers[i], data.getRow(rowIndexes[i]).getOriginalLineNumber());
		Assert.assertEquals(expectedContents[i], CollUtil.join(data.getRow(rowIndexes[i]), ","));
	}
}
/**
 * Reads test_lines.csv starting at original line 2 and checks that the
 * returned rows keep their original (0-based) line numbers.
 */
@Test
public void lineLimitTest(){
	// Start reading at original line 2
	CsvReader reader = new CsvReader(CsvReadConfig.defaultConfig().setBeginLineNo(2));
	CsvData data = reader.read(ResourceUtil.getReader("test_lines.csv", CharsetUtil.CHARSET_UTF_8));

	// Index i of each table describes row i of the result
	final long[] expectedLineNumbers = {2, 4, 6};
	final String[] expectedContents = {
			"1,2,3,4",
			"q,w,e,r,我是一段\n带换行的内容",
			"a,s,d,f"
	};

	for (int row = 0; row < expectedContents.length; row++) {
		Assert.assertEquals(expectedLineNumbers[row], data.getRow(row).getOriginalLineNumber());
		Assert.assertEquals(expectedContents[row], CollUtil.join(data.getRow(row), ","));
	}
}
/**
 * Reads test_lines.csv starting at original line 2 with header parsing enabled.
 * The first data row is then expected at original line 4, so the line at the
 * start position is not returned as data.
 */
@Test
public void lineLimitWithHeaderTest(){
	// Start reading at original line 2 and treat the first line read as the header
	CsvReader reader = new CsvReader(CsvReadConfig.defaultConfig().setBeginLineNo(2).setContainsHeader(true));
	CsvData data = reader.read(ResourceUtil.getReader("test_lines.csv", CharsetUtil.CHARSET_UTF_8));

	// Index i of each table describes row i of the result
	final long[] expectedLineNumbers = {4, 6};
	final String[] expectedContents = {
			"q,w,e,r,我是一段\n带换行的内容",
			"a,s,d,f"
	};

	for (int row = 0; row < expectedContents.length; row++) {
		Assert.assertEquals(expectedLineNumbers[row], data.getRow(row).getOriginalLineNumber());
		Assert.assertEquals(expectedContents[row], CollUtil.join(data.getRow(row), ","));
	}
}
}

View File

@ -1,2 +1,2 @@
# 这是一行注释,读取时应忽略
"sss,sss",姓名,"性别",关注"对象",年龄,"","""
"sss,sss",姓名,"性别",关注"对象",年龄,"","""

Can't render this file because it contains an unexpected character in line 2 and column 33.

View File

@ -0,0 +1,7 @@
# 这是一行注释,读取时应忽略
a,b,c,d
1,2,3,4
# 这是一行注释,读取时应忽略
q,w,e,r,"我是一段
带换行的内容"
a,s,d,f
Can't render this file because it has a wrong number of fields in line 2.