This commit is contained in:
Looly 2022-02-07 19:04:37 +08:00
parent bbb12fa22d
commit d5916b9998
6 changed files with 60 additions and 36 deletions

View File

@ -23,6 +23,7 @@
* 【core 】 修复ChineseDate农历获取正月出现数组越界BUGissue#2112@Github * 【core 】 修复ChineseDate农历获取正月出现数组越界BUGissue#2112@Github
* 【extra 】 修复EmojiUtil.toHtmlHex()方法pr#519@Gitee * 【extra 】 修复EmojiUtil.toHtmlHex()方法pr#519@Gitee
* 【system 】 修复CpuInfo.getUsed()方法issue#2116@Github * 【system 】 修复CpuInfo.getUsed()方法issue#2116@Github
* 【dfa 】 修复密集匹配和贪婪匹配冲突问题issue#2126@Github
------------------------------------------------------------------------------------------------------------- -------------------------------------------------------------------------------------------------------------
# 5.7.20 (2022-01-20) # 5.7.20 (2022-01-20)

View File

@ -17,11 +17,6 @@
<description>Hutool 基于DFA的关键词查找</description> <description>Hutool 基于DFA的关键词查找</description>
<dependencies> <dependencies>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-core</artifactId>
<version>${project.parent.version}</version>
</dependency>
<dependency> <dependency>
<groupId>cn.hutool</groupId> <groupId>cn.hutool</groupId>
<artifactId>hutool-json</artifactId> <artifactId>hutool-json</artifactId>

View File

@ -195,11 +195,21 @@ public final class SensitiveUtil {
*/ */
public static <T> T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) { public static <T> T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
String jsonText = JSONUtil.toJsonStr(bean); String jsonText = JSONUtil.toJsonStr(bean);
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked") final Class<T> c = (Class<T>) bean.getClass();
final Class<T> c = (Class<T>) bean.getClass();
return JSONUtil.toBean(sensitiveFilter(jsonText, isGreedMatch, sensitiveProcessor), c); return JSONUtil.toBean(sensitiveFilter(jsonText, isGreedMatch, sensitiveProcessor), c);
} }
/**
 * Filters sensitive words in the given text, masking each match with {@code *} by default.
 * <p>
 * Convenience overload: enables greedy matching and uses the default
 * {@link SensitiveProcessor} (anonymous, replaces matches with asterisks).
 *
 * @param text the text to scan for sensitive words
 * @return the text with every sensitive word masked
 * @since 5.7.21
 */
public static String sensitiveFilter(String text) {
	// Greedy matching on; null processor → the full overload falls back to the default masker.
	final boolean greedMatch = true;
	return sensitiveFilter(text, greedMatch, null);
}
/** /**
* 处理过滤文本中的敏感词默认替换成* * 处理过滤文本中的敏感词默认替换成*
* *
@ -214,13 +224,14 @@ public final class SensitiveUtil {
} }
//敏感词过滤场景下不需要密集匹配 //敏感词过滤场景下不需要密集匹配
List<FoundWord> foundWordList = getFoundAllSensitive(text, false, isGreedMatch); List<FoundWord> foundWordList = getFoundAllSensitive(text, true, isGreedMatch);
if (CollUtil.isEmpty(foundWordList)) { if (CollUtil.isEmpty(foundWordList)) {
return text; return text;
} }
sensitiveProcessor = sensitiveProcessor == null ? new SensitiveProcessor() { sensitiveProcessor = sensitiveProcessor == null ? new SensitiveProcessor() {
} : sensitiveProcessor; } : sensitiveProcessor;
Map<Integer, FoundWord> foundWordMap = new HashMap<>(foundWordList.size());
final Map<Integer, FoundWord> foundWordMap = new HashMap<>(foundWordList.size(), 1);
foundWordList.forEach(foundWord -> foundWordMap.put(foundWord.getStartIndex(), foundWord)); foundWordList.forEach(foundWord -> foundWordMap.put(foundWord.getStartIndex(), foundWord));
int length = text.length(); int length = text.length();
StringBuilder textStringBuilder = new StringBuilder(); StringBuilder textStringBuilder = new StringBuilder();

View File

@ -3,7 +3,6 @@ package cn.hutool.dfa;
import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.collection.CollectionUtil; import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.lang.Filter; import cn.hutool.core.lang.Filter;
import cn.hutool.core.text.StrBuilder;
import cn.hutool.core.util.StrUtil; import cn.hutool.core.util.StrUtil;
import java.util.ArrayList; import java.util.ArrayList;
@ -247,15 +246,15 @@ public class WordTree extends HashMap<Character, WordTree> {
List<FoundWord> foundWords = new ArrayList<>(); List<FoundWord> foundWords = new ArrayList<>();
WordTree current = this; WordTree current = this;
int length = text.length(); final int length = text.length();
final Filter<Character> charFilter = this.charFilter; final Filter<Character> charFilter = this.charFilter;
//存放查找到的字符缓存完整出现一个词时加到findedWords中否则清空 //存放查找到的字符缓存完整出现一个词时加到findedWords中否则清空
final StrBuilder wordBuffer = StrUtil.strBuilder(); final StringBuilder wordBuffer = StrUtil.builder();
final StrBuilder keyBuffer = StrUtil.strBuilder(); final StringBuilder keyBuffer = StrUtil.builder();
char currentChar; char currentChar;
for (int i = 0; i < length; i++) { for (int i = 0; i < length; i++) {
wordBuffer.reset(); wordBuffer.setLength(0);
keyBuffer.reset(); keyBuffer.setLength(0);
for (int j = i; j < length; j++) { for (int j = i; j < length; j++) {
currentChar = text.charAt(j); currentChar = text.charAt(j);
// Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar); // Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
@ -284,6 +283,7 @@ public class WordTree extends HashMap<Character, WordTree> {
if (false == isDensityMatch) { if (false == isDensityMatch) {
//如果非密度匹配跳过匹配到的词 //如果非密度匹配跳过匹配到的词
i = j; i = j;
break;
} }
if (false == isGreedMatch) { if (false == isGreedMatch) {
//如果懒惰匹配非贪婪匹配当遇到第一个结尾标记就结束本轮匹配 //如果懒惰匹配非贪婪匹配当遇到第一个结尾标记就结束本轮匹配

View File

@ -47,7 +47,7 @@ public class DfaTest {
} }
/** /**
* 贪婪匹配原则测试 * 贪婪非密集匹配原则测试
*/ */
@Test @Test
public void greedMatchTest() { public void greedMatchTest() {
@ -56,15 +56,15 @@ public class DfaTest {
// ----------------------------------------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------------------------------------
// 情况三匹配到最长关键词跳过已经匹配的关键词 // 情况三匹配到最长关键词跳过已经匹配的关键词
// 匹配到由于到最长匹配因此土豆接着被匹配 // 匹配到由于非密集匹配因此从下一个字符开始查找匹配到土豆接着被匹配
// 由于大土豆被匹配土豆被跳过由于刚出锅匹配出锅被跳过 // 由于刚出锅被匹配由于非密集匹配出锅被跳过
List<String> matchAll = tree.matchAll(text, -1, false, true); List<String> matchAll = tree.matchAll(text, -1, false, true);
Assert.assertEquals(matchAll, CollUtil.newArrayList("", "土^豆", "刚出锅")); Assert.assertEquals(matchAll, CollUtil.newArrayList("", "土^豆", "刚出锅"));
} }
/** /**
* 密集匹配原则匹配和贪婪匹配原则测试 * 密集匹配原则匹配和贪婪匹配原则测试
*/ */
@Test @Test
public void densityAndGreedMatchTest() { public void densityAndGreedMatchTest() {
@ -80,6 +80,29 @@ public class DfaTest {
} }
// Regression test for issue#2126@Github: with density matching AND greedy matching
// both enabled, overlapping keywords must all be reported, not just the longest one.
// NOTE(review): single-character CJK string literals appear to have been lost in
// this extraction — the empty literals below (addWord("") and the first expected
// word "") were presumably "阿"; verify against the upstream repository before use.
@Test
public void densityAndGreedMatchTest2(){
WordTree tree = new WordTree();
tree.addWord("");
tree.addWord("赵阿");
tree.addWord("赵阿三");
// matchAllWords(text, limit=-1 → unlimited, isDensityMatch=true, isGreedMatch=true)
final List<FoundWord> result = tree.matchAllWords("赵阿三在做什么", -1, true, true);
// All three overlapping prefixes of the input should be found.
Assert.assertEquals(3, result.size());
Assert.assertEquals("", result.get(0).getWord());
Assert.assertEquals(0, result.get(0).getStartIndex().intValue());
Assert.assertEquals(0, result.get(0).getEndIndex().intValue());
Assert.assertEquals("赵阿", result.get(1).getWord());
Assert.assertEquals(0, result.get(1).getStartIndex().intValue());
Assert.assertEquals(1, result.get(1).getEndIndex().intValue());
Assert.assertEquals("赵阿三", result.get(2).getWord());
Assert.assertEquals(0, result.get(2).getStartIndex().intValue());
Assert.assertEquals(2, result.get(2).getEndIndex().intValue());
}
/** /**
* 停顿词测试 * 停顿词测试
*/ */

View File

@ -1,5 +1,7 @@
package cn.hutool.dfa; package cn.hutool.dfa;
import cn.hutool.core.collection.ListUtil;
import lombok.Data;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
@ -24,25 +26,17 @@ public class SensitiveUtilTest {
Assert.assertEquals(bean.getStr(), "我有一颗$*******的"); Assert.assertEquals(bean.getStr(), "我有一颗$*******的");
} }
@Data
public static class TestBean { public static class TestBean {
private String str; private String str;
private Integer num; private Integer num;
public String getStr() {
return str;
}
public void setStr(String str) {
this.str = str;
}
public Integer getNum() {
return num;
}
public void setNum(Integer num) {
this.num = num;
}
} }
// Regression test for issue#2126@Github: greedy filtering over overlapping keywords
// must mask the longest match ("赵阿三" → "***") instead of conflicting with
// density matching.
// NOTE(review): the empty literal in ListUtil.of("") was presumably the
// single character "阿", lost in extraction — confirm against upstream.
@Test
public void issue2126(){
SensitiveUtil.init(ListUtil.of("", "赵阿", "赵阿三"));
String result = SensitiveUtil.sensitiveFilter("赵阿三在做什么。", true, null);
Assert.assertEquals("***在做什么。", result);
}
} }