This commit is contained in:
Looly 2022-02-07 19:04:37 +08:00
parent bbb12fa22d
commit d5916b9998
6 changed files with 60 additions and 36 deletions

View File

@ -23,6 +23,7 @@
* 【core 】 修复ChineseDate农历获取正月出现数组越界BUGissue#2112@Github
* 【extra 】 修复EmojiUtil.toHtmlHex()方法pr#519@Gitee
* 【system 】 修复CpuInfo.getUsed()方法issue#2116@Github
+* 【dfa    】     修复密集匹配和贪婪匹配冲突问题（issue#2126@Github）
-------------------------------------------------------------------------------------------------------------
# 5.7.20 (2022-01-20)

View File

@ -17,11 +17,6 @@
<description>Hutool 基于DFA的关键词查找</description>
<dependencies>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-core</artifactId>
<version>${project.parent.version}</version>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-json</artifactId>

View File

@ -195,11 +195,21 @@ public final class SensitiveUtil {
*/
public static <T> T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
String jsonText = JSONUtil.toJsonStr(bean);
-		@SuppressWarnings("unchecked")
-		final Class<T> c = (Class<T>) bean.getClass();
+		@SuppressWarnings("unchecked") final Class<T> c = (Class<T>) bean.getClass();
return JSONUtil.toBean(sensitiveFilter(jsonText, isGreedMatch, sensitiveProcessor), c);
}
+	/**
+	 * 处理过滤文本中的敏感词，默认替换成*
+	 *
+	 * @param text 文本
+	 * @return 敏感词过滤处理后的文本
+	 * @since 5.7.21
+	 */
+	public static String sensitiveFilter(String text) {
+		return sensitiveFilter(text, true, null);
+	}
/**
* 处理过滤文本中的敏感词默认替换成*
*
@ -214,13 +224,14 @@ public final class SensitiveUtil {
}
//敏感词过滤场景下不需要密集匹配
-		List<FoundWord> foundWordList = getFoundAllSensitive(text, false, isGreedMatch);
+		List<FoundWord> foundWordList = getFoundAllSensitive(text, true, isGreedMatch);
if (CollUtil.isEmpty(foundWordList)) {
return text;
}
sensitiveProcessor = sensitiveProcessor == null ? new SensitiveProcessor() {
} : sensitiveProcessor;
-		Map<Integer, FoundWord> foundWordMap = new HashMap<>(foundWordList.size());
+		final Map<Integer, FoundWord> foundWordMap = new HashMap<>(foundWordList.size(), 1);
foundWordList.forEach(foundWord -> foundWordMap.put(foundWord.getStartIndex(), foundWord));
int length = text.length();
StringBuilder textStringBuilder = new StringBuilder();

View File

@ -3,7 +3,6 @@ package cn.hutool.dfa;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.lang.Filter;
-import cn.hutool.core.text.StrBuilder;
import cn.hutool.core.util.StrUtil;
import java.util.ArrayList;
@ -247,15 +246,15 @@ public class WordTree extends HashMap<Character, WordTree> {
List<FoundWord> foundWords = new ArrayList<>();
WordTree current = this;
-		int length = text.length();
+		final int length = text.length();
final Filter<Character> charFilter = this.charFilter;
//存放查找到的字符缓存完整出现一个词时加到findedWords中否则清空
-		final StrBuilder wordBuffer = StrUtil.strBuilder();
-		final StrBuilder keyBuffer = StrUtil.strBuilder();
+		final StringBuilder wordBuffer = StrUtil.builder();
+		final StringBuilder keyBuffer = StrUtil.builder();
char currentChar;
for (int i = 0; i < length; i++) {
-			wordBuffer.reset();
-			keyBuffer.reset();
+			wordBuffer.setLength(0);
+			keyBuffer.setLength(0);
for (int j = i; j < length; j++) {
currentChar = text.charAt(j);
// Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
@ -284,6 +283,7 @@ public class WordTree extends HashMap<Character, WordTree> {
if (false == isDensityMatch) {
//如果非密度匹配跳过匹配到的词
i = j;
+					break;
}
if (false == isGreedMatch) {
//如果懒惰匹配非贪婪匹配当遇到第一个结尾标记就结束本轮匹配

View File

@ -47,7 +47,7 @@ public class DfaTest {
}
/**
-	 * 贪婪匹配原则测试
+	 * 贪婪非密集匹配原则测试
*/
@Test
public void greedMatchTest() {
@ -56,15 +56,15 @@ public class DfaTest {
// -----------------------------------------------------------------------------------------------------------------------------------
// 情况三匹配到最长关键词跳过已经匹配的关键词
-		// 匹配到“大”，由于到最长匹配，因此“大土豆”接着被匹配
-		// 由于“大土豆”被匹配，“土豆”被跳过，由于“刚出锅”匹配，“出锅”被跳过
+		// 匹配到“大”，由于非密集匹配，因此从下一个字符开始查找，匹配到“土豆”接着被匹配
+		// 由于“刚出锅”被匹配，由于非密集匹配，“出锅”被跳过
List<String> matchAll = tree.matchAll(text, -1, false, true);
-		Assert.assertEquals(matchAll, CollUtil.newArrayList("大", "大土^豆", "刚出锅"));
+		Assert.assertEquals(matchAll, CollUtil.newArrayList("大", "土^豆", "刚出锅"));
}
/**
* 密集匹配原则匹配和贪婪匹配原则测试
* 密集匹配原则匹配和贪婪匹配原则测试
*/
@Test
public void densityAndGreedMatchTest() {
@ -80,6 +80,29 @@ public class DfaTest {
}
@Test
+	public void densityAndGreedMatchTest2(){
+		WordTree tree = new WordTree();
+		tree.addWord("赵");
+		tree.addWord("赵阿");
+		tree.addWord("赵阿三");
+		final List<FoundWord> result = tree.matchAllWords("赵阿三在做什么", -1, true, true);
+		Assert.assertEquals(3, result.size());
+		Assert.assertEquals("赵", result.get(0).getWord());
+		Assert.assertEquals(0, result.get(0).getStartIndex().intValue());
+		Assert.assertEquals(0, result.get(0).getEndIndex().intValue());
+		Assert.assertEquals("赵阿", result.get(1).getWord());
+		Assert.assertEquals(0, result.get(1).getStartIndex().intValue());
+		Assert.assertEquals(1, result.get(1).getEndIndex().intValue());
+		Assert.assertEquals("赵阿三", result.get(2).getWord());
+		Assert.assertEquals(0, result.get(2).getStartIndex().intValue());
+		Assert.assertEquals(2, result.get(2).getEndIndex().intValue());
+	}
/**
* 停顿词测试
*/

View File

@ -1,5 +1,7 @@
package cn.hutool.dfa;
+import cn.hutool.core.collection.ListUtil;
+import lombok.Data;
import org.junit.Assert;
import org.junit.Test;
@ -24,25 +26,17 @@ public class SensitiveUtilTest {
Assert.assertEquals(bean.getStr(), "我有一颗$*******的");
}
+	@Data
 	public static class TestBean {
 		private String str;
 		private Integer num;
-
-		public String getStr() {
-			return str;
-		}
-
-		public void setStr(String str) {
-			this.str = str;
-		}
-
-		public Integer getNum() {
-			return num;
-		}
-
-		public void setNum(Integer num) {
-			this.num = num;
-		}
 	}
+
+	@Test
+	public void issue2126(){
+		SensitiveUtil.init(ListUtil.of("赵", "赵阿", "赵阿三"));
+		String result = SensitiveUtil.sensitiveFilter("赵阿三在做什么。", true, null);
+		Assert.assertEquals("***在做什么。", result);
+	}
}