diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java b/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java
new file mode 100644
index 000000000..e57a0f9ec
--- /dev/null
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java
@@ -0,0 +1,49 @@
+package cn.hutool.dfa;
+
+/**
+ * A sensitive-word match: the word that took effect, the content of the text it matched, and the
+ * position of that content in the text. The indexes allow further processing, e.g. masking with **.
+ *
+ * @author 肖海斌
+ */
+public class FoundWord {
+	/** The sensitive word that took effect. */
+	private final String word;
+	/** The content the sensitive word matched. */
+	private final String foundWord;
+	/** Start position of the matched content in the analysed string. */
+	private final int startIndex;
+	/** End position of the matched content in the analysed string. */
+	private final int endIndex;
+
+	/**
+	 * Creates an immutable match result.
+	 *
+	 * @param word      the sensitive word that took effect
+	 * @param foundWord the content the sensitive word matched
+	 * @param start     start position of the matched content in the analysed string
+	 * @param end       end position of the matched content in the analysed string
+	 */
+	public FoundWord(String word, String foundWord, int start, int end) {
+		this.word = word;
+		this.foundWord = foundWord;
+		this.startIndex = start;
+		this.endIndex = end;
+	}
+
+	public String getWord() {
+		return word;
+	}
+
+	public String getFoundWord() {
+		return foundWord;
+	}
+
+	public int getStartIndex() {
+		return startIndex;
+	}
+
+	public int getEndIndex() {
+		return endIndex;
+	}
+}
diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java
new file mode 100644
index 000000000..34c0128b8
--- /dev/null
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java
@@ -0,0 +1,22 @@
+package cn.hutool.dfa;
+
+/**
+ * Sensitive-word filter processor; by default it replaces the match with one * per character.
+ * @author 肖海斌
+ */
+public interface SensitiveProcessor {
+
+	/**
+	 * Produces the replacement text for one sensitive-word match.
+	 * @param foundWord the content matched by a sensitive word
+	 * @return the filtered content; by default one * per character of the matched content
+	 */
+	default String process(FoundWord foundWord) {
+		final char[] mask = new char[foundWord.getFoundWord().length()];
+		for (int pos = 0; pos < mask.length; pos++) {
+			mask[pos] = '*';
+		}
+		// The mask has exactly as many characters as the matched content.
+		return new String(mask);
+	}
+}
diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java
index d64100494..6a396d49d 100644
--- a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java
@@ -1,77 +1,84 @@
package cn.hutool.dfa;
+import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.lang.Filter;
import cn.hutool.core.thread.ThreadUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.json.JSONUtil;
import java.util.Collection;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
/**
* 敏感词工具类
- * @author Looly
*
+ * @author Looly
*/
public final class SensitiveUtil {
public static final char DEFAULT_SEPARATOR = StrUtil.C_COMMA;
private static final WordTree sensitiveTree = new WordTree();
-
+
/**
* @return 是否已经被初始化
*/
- public static boolean isInited(){
+ public static boolean isInited() {
return !sensitiveTree.isEmpty();
}
-
+
/**
* 初始化敏感词树
- * @param isAsync 是否异步初始化
+ *
+ * @param isAsync 是否异步初始化
* @param sensitiveWords 敏感词列表
*/
- public static void init(final Collection sensitiveWords, boolean isAsync){
- if(isAsync){
+ public static void init(final Collection sensitiveWords, boolean isAsync) {
+ if (isAsync) {
ThreadUtil.execAsync(() -> {
init(sensitiveWords);
return true;
});
- }else{
+ } else {
init(sensitiveWords);
}
}
-
+
/**
* 初始化敏感词树
+ *
* @param sensitiveWords 敏感词列表
*/
- public static void init(Collection sensitiveWords){
+ public static void init(Collection sensitiveWords) {
sensitiveTree.clear();
sensitiveTree.addWords(sensitiveWords);
// log.debug("Sensitive init finished, sensitives: {}", sensitiveWords);
}
-
+
/**
* 初始化敏感词树
+ *
* @param sensitiveWords 敏感词列表组成的字符串
- * @param isAsync 是否异步初始化
- * @param separator 分隔符
+ * @param isAsync 是否异步初始化
+ * @param separator 分隔符
*/
- public static void init(String sensitiveWords, char separator, boolean isAsync){
- if(StrUtil.isNotBlank(sensitiveWords)){
+ public static void init(String sensitiveWords, char separator, boolean isAsync) {
+ if (StrUtil.isNotBlank(sensitiveWords)) {
init(StrUtil.split(sensitiveWords, separator), isAsync);
}
}
-
+
/**
* 初始化敏感词树,使用逗号分隔每个单词
+ *
* @param sensitiveWords 敏感词列表组成的字符串
- * @param isAsync 是否异步初始化
+ * @param isAsync 是否异步初始化
*/
- public static void init(String sensitiveWords, boolean isAsync){
+ public static void init(String sensitiveWords, boolean isAsync) {
init(sensitiveWords, DEFAULT_SEPARATOR, isAsync);
}
-
+
/**
* 设置字符过滤规则,通过定义字符串过滤规则,过滤不需要的字符
* 当accept为false时,此字符不参与匹配
@@ -80,90 +87,144 @@ public final class SensitiveUtil {
* @since 5.4.4
*/
public static void setCharFilter(Filter charFilter) {
- if(charFilter != null) {
+ if (charFilter != null) {
sensitiveTree.setCharFilter(charFilter);
}
}
-
+
/**
* 是否包含敏感词
+ *
* @param text 文本
* @return 是否包含
*/
- public static boolean containsSensitive(String text){
+ public static boolean containsSensitive(String text) {
return sensitiveTree.isMatch(text);
}
-
+
/**
* 是否包含敏感词
+ *
* @param obj bean,会被转为JSON字符串
* @return 是否包含
*/
- public static boolean containsSensitive(Object obj){
+ public static boolean containsSensitive(Object obj) {
return sensitiveTree.isMatch(JSONUtil.toJsonStr(obj));
}
-
+
/**
* 查找敏感词,返回找到的第一个敏感词
+ *
* @param text 文本
* @return 敏感词
*/
- public static String getFindedFirstSensitive(String text){
+ public static FoundWord getFindedFirstSensitive(String text) {
return sensitiveTree.match(text);
}
-
+
/**
* 查找敏感词,返回找到的第一个敏感词
+ *
* @param obj bean,会被转为JSON字符串
* @return 敏感词
*/
- public static String getFindedFirstSensitive(Object obj){
+ public static FoundWord getFindedFirstSensitive(Object obj) {
return sensitiveTree.match(JSONUtil.toJsonStr(obj));
}
-
+
/**
* 查找敏感词,返回找到的所有敏感词
+ *
* @param text 文本
* @return 敏感词
*/
- public static List getFindedAllSensitive(String text){
+ public static List getFindedAllSensitive(String text) {
return sensitiveTree.matchAll(text);
}
-
+
/**
* 查找敏感词,返回找到的所有敏感词
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
- *
- * @param text 文本
+ *
+ * @param text 文本
* @param isDensityMatch 是否使用密集匹配原则
- * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
+ * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
* @return 敏感词
*/
- public static List getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch){
+ public static List getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch) {
return sensitiveTree.matchAll(text, -1, isDensityMatch, isGreedMatch);
}
-
+
/**
* 查找敏感词,返回找到的所有敏感词
+ *
* @param bean 对象,会被转为JSON
* @return 敏感词
*/
- public static List getFindedAllSensitive(Object bean){
+ public static List getFindedAllSensitive(Object bean) {
return sensitiveTree.matchAll(JSONUtil.toJsonStr(bean));
}
-
+
/**
* 查找敏感词,返回找到的所有敏感词
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
- *
- * @param bean 对象,会被转为JSON
+ *
+ * @param bean 对象,会被转为JSON
* @param isDensityMatch 是否使用密集匹配原则
- * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
+ * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
* @return 敏感词
*/
- public static List getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch){
+ public static List getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch) {
return getFindedAllSensitive(JSONUtil.toJsonStr(bean), isDensityMatch, isGreedMatch);
}
+
+	/**
+	 * Filters sensitive words in a bean by converting it to JSON, filtering the JSON text and parsing it back.
+	 *
+	 * @param bean               the object to filter; it is converted to a JSON string
+	 * @param isGreedMatch       greedy (longest) match: with keywords a and ab, longest match yields [a, ab]
+	 * @param sensitiveProcessor sensitive-word processor; when null, each matched character becomes *
+	 * @param <T>                the class of the bean
+	 * @return the bean parsed from the filtered JSON text
+	 */
+	@SuppressWarnings("unchecked") // bean is a T, so its runtime class can be used as Class<T> here
+	public static <T> T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
+		final String filteredJson = sensitiveFilter(JSONUtil.toJsonStr(bean), isGreedMatch, sensitiveProcessor);
+		return JSONUtil.toBean(filteredJson, (Class<T>) bean.getClass());
+	}
+
+	/**
+	 * Replaces every sensitive word found in the text with the output of the processor.
+	 * @param text               text to filter; null or whitespace-only text is returned unchanged
+	 * @param isGreedMatch       greedy (longest) match: with keywords a and ab, longest match yields [a, ab]
+	 * @param sensitiveProcessor sensitive-word processor; when null, each matched character becomes *
+	 * @return the text after sensitive-word filtering
+	 */
+	public static String sensitiveFilter(String text, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
+		if (null == text || text.trim().isEmpty()) {
+			return text;
+		}
+		// Density matching is not needed when filtering.
+		final List<FoundWord> foundWordList = getFindedAllSensitive(text, false, isGreedMatch);
+		if (CollectionUtil.isEmpty(foundWordList)) {
+			return text;
+		}
+		final SensitiveProcessor processor = null == sensitiveProcessor ? new SensitiveProcessor() {} : sensitiveProcessor;
+		// Key each match by its start index; a later match with the same start replaces the earlier one.
+		final Map<Integer, FoundWord> foundWordMap = new HashMap<>(foundWordList.size());
+		foundWordList.forEach(foundWord -> foundWordMap.put(foundWord.getStartIndex(), foundWord));
+		final StringBuilder filtered = new StringBuilder(text.length());
+		for (int i = 0; i < text.length(); i++) {
+			final FoundWord fw = foundWordMap.get(i);
+			if (null == fw) {
+				filtered.append(text.charAt(i));
+				continue;
+			}
+			filtered.append(processor.process(fw));
+			i = fw.getEndIndex(); // the loop increment then moves to the position after the match
+		}
+		return filtered.toString();
+	}
}
diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
index 0d715b338..4e05657b9 100644
--- a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
+++ b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java
@@ -5,12 +5,7 @@ import cn.hutool.core.lang.Filter;
import cn.hutool.core.text.StrBuilder;
import cn.hutool.core.util.StrUtil;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import java.util.*;
/**
* DFA(Deterministic Finite Automaton 确定有穷自动机)
@@ -140,11 +135,11 @@ public class WordTree extends HashMap {
* @param text 被检查的文本
* @return 匹配到的关键字
*/
- public String match(String text) {
+ public FoundWord match(String text) {
if (null == text) {
return null;
}
- List matchAll = matchAll(text, 1);
+ List matchAll = matchAll(text, 1);
if (CollectionUtil.isNotEmpty(matchAll)) {
return matchAll.get(0);
}
@@ -159,7 +154,7 @@ public class WordTree extends HashMap {
* @param text 被检查的文本
* @return 匹配的词列表
*/
- public List matchAll(String text) {
+ public List matchAll(String text) {
return matchAll(text, -1);
}
@@ -170,7 +165,7 @@ public class WordTree extends HashMap {
* @param limit 限制匹配个数
* @return 匹配的词列表
*/
- public List matchAll(String text, int limit) {
+ public List matchAll(String text, int limit) {
return matchAll(text, limit, false, false);
}
@@ -185,20 +180,22 @@ public class WordTree extends HashMap {
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
* @return 匹配的词列表
*/
- public List matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
+ public List matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
if (null == text) {
return null;
}
- List foundWords = new ArrayList<>();
+ List foundWords = new ArrayList<>();
WordTree current = this;
int length = text.length();
final Filter charFilter = this.charFilter;
//存放查找到的字符缓存。完整出现一个词时加到findedWords中,否则清空
final StrBuilder wordBuffer = StrUtil.strBuilder();
+ final StrBuilder keyBuffer = StrUtil.strBuilder();
char currentChar;
for (int i = 0; i < length; i++) {
wordBuffer.reset();
+ keyBuffer.reset();
for (int j = i; j < length; j++) {
currentChar = text.charAt(j);
// Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
@@ -216,9 +213,10 @@ public class WordTree extends HashMap {
break;
}
wordBuffer.append(currentChar);
+ keyBuffer.append(currentChar);
if (current.isEnd(currentChar)) {
//到达单词末尾,关键词成立,从此词的下一个位置开始查找
- foundWords.add(wordBuffer.toString());
+ foundWords.add(new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j));
if (limit > 0 && foundWords.size() >= limit) {
//超过匹配限制个数,直接返回
return foundWords;
diff --git a/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java b/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java
index 9d500f870..913f10fce 100644
--- a/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java
+++ b/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java
@@ -1,16 +1,17 @@
package cn.hutool.dfa.test;
-import java.util.List;
-
+import cn.hutool.core.collection.CollectionUtil;
+import cn.hutool.dfa.FoundWord;
+import cn.hutool.dfa.WordTree;
import org.junit.Assert;
import org.junit.Test;
-import cn.hutool.core.collection.CollectionUtil;
-import cn.hutool.dfa.WordTree;
+import java.util.List;
+import java.util.stream.Collectors;
/**
* DFA单元测试
- *
+ *
* @author Looly
*
*/
@@ -28,8 +29,8 @@ public class DfaTest {
// 情况一:标准匹配,匹配到最短关键词,并跳过已经匹配的关键词
// 匹配到【大】,就不再继续匹配了,因此【大土豆】不匹配
// 匹配到【刚出锅】,就跳过这三个字了,因此【出锅】不匹配(由于刚首先被匹配,因此长的被匹配,最短匹配只针对第一个字相同选最短)
- List matchAll = tree.matchAll(text, -1, false, false);
- Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土^豆", "刚出锅"));
+ List matchAll = tree.matchAll(text, -1, false, false);
+ Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()), CollectionUtil.newArrayList("大", "土^豆", "刚出锅"));
}
/**
@@ -44,8 +45,8 @@ public class DfaTest {
// 情况二:匹配到最短关键词,不跳过已经匹配的关键词
// 【大】被匹配,最短匹配原则【大土豆】被跳过,【土豆继续被匹配】
// 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配
- List matchAll = tree.matchAll(text, -1, true, false);
- Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土^豆", "刚出锅", "出锅"));
+ List matchAll = tree.matchAll(text, -1, true, false);
+ Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()), CollectionUtil.newArrayList("大", "土^豆", "刚出锅", "出锅"));
}
/**
@@ -60,8 +61,8 @@ public class DfaTest {
// 情况三:匹配到最长关键词,跳过已经匹配的关键词
// 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配
// 由于【大土豆】被匹配,【土豆】被跳过,由于【刚出锅】被匹配,【出锅】被跳过
- List matchAll = tree.matchAll(text, -1, false, true);
- Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土^豆", "刚出锅"));
+ List matchAll = tree.matchAll(text, -1, false, true);
+ Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()), CollectionUtil.newArrayList("大", "大土^豆", "刚出锅"));
}
@@ -77,8 +78,8 @@ public class DfaTest {
// 情况四:匹配到最长关键词,不跳过已经匹配的关键词(最全关键词)
// 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配,由于不跳过已经匹配的关键词,土豆继续被匹配
// 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配
- List matchAll = tree.matchAll(text, -1, true, true);
- Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土^豆", "土^豆", "刚出锅", "出锅"));
+ List matchAll = tree.matchAll(text, -1, true, true);
+ Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()), CollectionUtil.newArrayList("大", "大土^豆", "土^豆", "刚出锅", "出锅"));
}
@@ -90,23 +91,24 @@ public class DfaTest {
WordTree tree = new WordTree();
tree.addWord("tio");
- List all = tree.matchAll("AAAAAAAt-ioBBBBBBB");
- Assert.assertEquals(all, CollectionUtil.newArrayList("t-io"));
+ List all = tree.matchAll("AAAAAAAt-ioBBBBBBB");
+ Assert.assertEquals(all.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()), CollectionUtil.newArrayList("t-io"));
}
@Test
- public void aTest(){
+ public void aTest() {
WordTree tree = new WordTree();
tree.addWord("women");
String text = "a WOMEN todo.".toLowerCase();
- List matchAll = tree.matchAll(text, -1, false, false);
- Assert.assertEquals("[women]", matchAll.toString());
+ List matchAll = tree.matchAll(text, -1, false, false);
+ Assert.assertEquals("[women]", matchAll.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()).toString());
}
-
+
// ----------------------------------------------------------------------------------------------------------
+
/**
* 构建查找树
- *
+ *
* @return 查找树
*/
private WordTree buildWordTree() {
diff --git a/hutool-dfa/src/test/java/cn/hutool/dfa/test/SensitiveUtilTest.java b/hutool-dfa/src/test/java/cn/hutool/dfa/test/SensitiveUtilTest.java
new file mode 100644
index 000000000..8a19fef3e
--- /dev/null
+++ b/hutool-dfa/src/test/java/cn/hutool/dfa/test/SensitiveUtilTest.java
@@ -0,0 +1,49 @@
+package cn.hutool.dfa.test;
+
+import cn.hutool.dfa.SensitiveUtil;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class SensitiveUtilTest {
+	@Test
+	public void testSensitiveFilter() {
+		List<String> wordList = new ArrayList<>();
+		wordList.add("大");
+		wordList.add("大土豆");
+		wordList.add("土豆");
+		wordList.add("刚出锅");
+		wordList.add("出锅");
+		TestBean bean = new TestBean();
+		bean.setStr("我有一颗$大土^豆,刚出锅的");
+		bean.setNum(100);
+		SensitiveUtil.init(wordList);
+		// Greedy match with the default processor: every matched character, the inner ^ included, becomes *.
+		bean = SensitiveUtil.sensitiveFilter(bean, true, null);
+		Assert.assertEquals("我有一颗$****,***的", bean.getStr());
+	}
+
+	/** Simple bean passed through the bean overload of sensitiveFilter. */
+	public static class TestBean {
+		private String str;
+		private Integer num;
+
+		public String getStr() {
+			return str;
+		}
+
+		public void setStr(String str) {
+			this.str = str;
+		}
+
+		public Integer getNum() {
+			return num;
+		}
+
+		public void setNum(Integer num) {
+			this.num = num;
+		}
+	}
+}