Merge branch 'v5-master' into v5-dev

haibinxiao 2020-12-06 19:29:56 +08:00
commit 2cbe4af68a
6 changed files with 257 additions and 75 deletions

View File

@@ -0,0 +1,50 @@
package cn.hutool.dfa;

/**
 * A matched sensitive word: holds the sensitive word itself, the content in the text that matched it,
 * and the indexes of the matched content within the text.
 * The indexes can be used for further processing of the match, for example replacing it with **.
 *
 * @author 肖海斌
 * @date 2020-12-05
 */
public class FoundWord {
	/**
	 * The sensitive word (keyword) that matched
	 */
	private String word;
	/**
	 * The content in the text matched by the sensitive word
	 */
	private String foundWord;
	/**
	 * Start index of the matched content in the analyzed string
	 */
	private int startIndex;
	/**
	 * End index of the matched content in the analyzed string
	 */
	private int endIndex;

	public FoundWord(String word, String foundWord, int start, int end) {
		this.word = word;
		this.foundWord = foundWord;
		this.startIndex = start;
		this.endIndex = end;
	}

	public String getWord() {
		return word;
	}

	public String getFoundWord() {
		return foundWord;
	}

	public int getStartIndex() {
		return startIndex;
	}

	public int getEndIndex() {
		return endIndex;
	}
}
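Judging by how WordTree.matchAll and SensitiveUtil.sensitiveFilter below use these fields, startIndex and endIndex are both inclusive positions in the analyzed text. A minimal sketch (hypothetical values, not part of this commit) of recovering a matched span from the original text:

	String text = "一颗土豆";
	// matchAll would report the keyword "土豆" found at indexes 2..3 of this text
	FoundWord fw = new FoundWord("土豆", "土豆", 2, 3);
	// endIndex is inclusive, so the matched content is text.substring(startIndex, endIndex + 1)
	String matched = text.substring(fw.getStartIndex(), fw.getEndIndex() + 1); // "土豆"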

View File

@@ -0,0 +1,23 @@
package cn.hutool.dfa;

/**
 * Sensitive word processor; by default the matched content is replaced with one '*' per character.
 *
 * @author 肖海斌
 * @date 2020-12-05
 */
public interface SensitiveProcessor {

	/**
	 * Process a matched sensitive word
	 *
	 * @param foundWord the content matched by the sensitive word
	 * @return the filtered content; by default one '*' per character of the match
	 */
	default String process(FoundWord foundWord) {
		int length = foundWord.getFoundWord().length();
		StringBuilder sb = new StringBuilder(length);
		for (int i = 0; i < length; i++) {
			sb.append("*");
		}
		return sb.toString();
	}
}
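Since process is a default method, an implementation only needs to override it to change how matches are masked. A brief sketch (illustrative only, not part of this commit) that hides every match behind a fixed placeholder:

	// Replaces any match with a fixed mask instead of one '*' per matched character
	SensitiveProcessor fixedMask = new SensitiveProcessor() {
		@Override
		public String process(FoundWord foundWord) {
			return "***";
		}
	};
	// It can then be passed as the third argument of SensitiveUtil.sensitiveFilter(text, isGreedMatch, processor);
	// passing null keeps the default per-character '*' behaviour shown above.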

View File

@@ -1,17 +1,20 @@
package cn.hutool.dfa;

import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.lang.Filter;
import cn.hutool.core.thread.ThreadUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.json.JSONUtil;

import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Sensitive word utility class
 *
 * @author Looly
 */
public final class SensitiveUtil {
@@ -21,31 +24,33 @@ public final class SensitiveUtil {
	/**
	 * @return whether the sensitive word tree has been initialized
	 */
	public static boolean isInited() {
		return !sensitiveTree.isEmpty();
	}

	/**
	 * Initialize the sensitive word tree
	 *
	 * @param isAsync        whether to initialize asynchronously
	 * @param sensitiveWords collection of sensitive words
	 */
	public static void init(final Collection<String> sensitiveWords, boolean isAsync) {
		if (isAsync) {
			ThreadUtil.execAsync(() -> {
				init(sensitiveWords);
				return true;
			});
		} else {
			init(sensitiveWords);
		}
	}

	/**
	 * Initialize the sensitive word tree
	 *
	 * @param sensitiveWords collection of sensitive words
	 */
	public static void init(Collection<String> sensitiveWords) {
		sensitiveTree.clear();
		sensitiveTree.addWords(sensitiveWords);
//		log.debug("Sensitive init finished, sensitives: {}", sensitiveWords);
@@ -53,22 +58,24 @@ public final class SensitiveUtil {
	/**
	 * Initialize the sensitive word tree
	 *
	 * @param sensitiveWords string made up of the sensitive words
	 * @param isAsync        whether to initialize asynchronously
	 * @param separator      separator between words
	 */
	public static void init(String sensitiveWords, char separator, boolean isAsync) {
		if (StrUtil.isNotBlank(sensitiveWords)) {
			init(StrUtil.split(sensitiveWords, separator), isAsync);
		}
	}

	/**
	 * Initialize the sensitive word tree, with words separated by commas
	 *
	 * @param sensitiveWords string made up of the sensitive words
	 * @param isAsync        whether to initialize asynchronously
	 */
	public static void init(String sensitiveWords, boolean isAsync) {
		init(sensitiveWords, DEFAULT_SEPARATOR, isAsync);
	}
@@ -80,53 +87,58 @@ public final class SensitiveUtil {
	 * @since 5.4.4
	 */
	public static void setCharFilter(Filter<Character> charFilter) {
		if (charFilter != null) {
			sensitiveTree.setCharFilter(charFilter);
		}
	}

	/**
	 * Whether the text contains sensitive words
	 *
	 * @param text the text
	 * @return whether sensitive words are contained
	 */
	public static boolean containsSensitive(String text) {
		return sensitiveTree.isMatch(text);
	}

	/**
	 * Whether the object contains sensitive words
	 *
	 * @param obj bean, will be converted to a JSON string
	 * @return whether sensitive words are contained
	 */
	public static boolean containsSensitive(Object obj) {
		return sensitiveTree.isMatch(JSONUtil.toJsonStr(obj));
	}

	/**
	 * Find sensitive words, returning the first match
	 *
	 * @param text the text
	 * @return the matched sensitive word
	 */
	public static FoundWord getFindedFirstSensitive(String text) {
		return sensitiveTree.match(text);
	}

	/**
	 * Find sensitive words, returning the first match
	 *
	 * @param obj bean, will be converted to a JSON string
	 * @return the matched sensitive word
	 */
	public static FoundWord getFindedFirstSensitive(Object obj) {
		return sensitiveTree.match(JSONUtil.toJsonStr(obj));
	}

	/**
	 * Find sensitive words, returning all matches
	 *
	 * @param text the text
	 * @return the matched sensitive words
	 */
	public static List<FoundWord> getFindedAllSensitive(String text) {
		return sensitiveTree.matchAll(text);
	}
@@ -140,16 +152,17 @@ public final class SensitiveUtil {
	 * @param isGreedMatch whether to use greedy matching (longest-match rule)
	 * @return the matched sensitive words
	 */
	public static List<FoundWord> getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch) {
		return sensitiveTree.matchAll(text, -1, isDensityMatch, isGreedMatch);
	}

	/**
	 * Find sensitive words, returning all matches
	 *
	 * @param bean object, will be converted to JSON
	 * @return the matched sensitive words
	 */
	public static List<FoundWord> getFindedAllSensitive(Object bean) {
		return sensitiveTree.matchAll(JSONUtil.toJsonStr(bean));
	}
@@ -163,7 +176,55 @@ public final class SensitiveUtil {
	 * @param isGreedMatch whether to use greedy matching (longest-match rule)
	 * @return the matched sensitive words
	 */
	public static List<FoundWord> getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch) {
		return getFindedAllSensitive(JSONUtil.toJsonStr(bean), isDensityMatch, isGreedMatch);
	}

	/**
	 * Filter sensitive words
	 *
	 * @param bean               the object, will be converted to JSON
	 * @param isGreedMatch       greedy matching (longest-match rule): for keywords a and ab, greedy matching matches [a, ab]
	 * @param sensitiveProcessor the sensitive word processor; by default each character of the match is replaced with '*'
	 * @param <T>                the class type of the bean
	 * @return the bean after sensitive word filtering
	 */
	public static <T> T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
		sensitiveProcessor = (null == sensitiveProcessor) ? new SensitiveProcessor() {} : sensitiveProcessor;
		String jsonText = JSONUtil.toJsonStr(bean);
		Class<T> c = (Class<T>) bean.getClass();
		return JSONUtil.toBean(sensitiveFilter(jsonText, isGreedMatch, sensitiveProcessor), c);
	}

	/**
	 * Filter sensitive words
	 *
	 * @param text               the text
	 * @param isGreedMatch       greedy matching (longest-match rule): for keywords a and ab, greedy matching matches [a, ab]
	 * @param sensitiveProcessor the sensitive word processor; by default each character of the match is replaced with '*'
	 * @return the text after sensitive word filtering
	 */
	public static String sensitiveFilter(String text, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) {
		if (null == text || text.trim().equals("")) {
			return text;
		}
		// density matching is not needed when filtering
		List<FoundWord> foundWordList = getFindedAllSensitive(text, false, isGreedMatch);
		if (CollectionUtil.isEmpty(foundWordList)) {
			return text;
		}
		Map<Integer, FoundWord> foundWordMap = new HashMap<>(foundWordList.size());
		foundWordList.forEach(foundWord -> foundWordMap.put(foundWord.getStartIndex(), foundWord));
		int length = text.length();
		StringBuilder textStringBuilder = new StringBuilder();
		for (int i = 0; i < length; i++) {
			FoundWord fw = foundWordMap.get(i);
			if (fw != null) {
				// replace the matched span and jump to the end of the match
				textStringBuilder.append(sensitiveProcessor.process(fw));
				i = fw.getEndIndex();
			} else {
				textStringBuilder.append(text.charAt(i));
			}
		}
		return textStringBuilder.toString();
	}
}
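A short usage sketch of the new filtering entry point (word list, input text and expected output are taken from the test at the end of this commit, which is the authoritative behaviour):

	List<String> words = new ArrayList<>();
	words.add("大");
	words.add("大土豆");
	words.add("土豆");
	words.add("刚出锅");
	words.add("出锅");
	SensitiveUtil.init(words);
	// greedy matching with the default processor replaces each matched character with '*'
	String filtered = SensitiveUtil.sensitiveFilter("我有一颗$大土^豆,刚出锅的", true, null);
	// the test below expects "我有一颗$*******的"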

View File

@@ -5,12 +5,7 @@ import cn.hutool.core.lang.Filter;
import cn.hutool.core.text.StrBuilder;
import cn.hutool.core.util.StrUtil;

import java.util.*;

/**
 * DFA (Deterministic Finite Automaton)
@@ -140,11 +135,11 @@ public class WordTree extends HashMap<Character, WordTree> {
	 * @param text the text to check
	 * @return the matched keyword
	 */
	public FoundWord match(String text) {
		if (null == text) {
			return null;
		}
		List<FoundWord> matchAll = matchAll(text, 1);
		if (CollectionUtil.isNotEmpty(matchAll)) {
			return matchAll.get(0);
		}
@@ -159,7 +154,7 @@ public class WordTree extends HashMap<Character, WordTree> {
	 * @param text the text to check
	 * @return the list of matched words
	 */
	public List<FoundWord> matchAll(String text) {
		return matchAll(text, -1);
	}
@@ -170,7 +165,7 @@ public class WordTree extends HashMap<Character, WordTree> {
	 * @param limit the maximum number of matches
	 * @return the list of matched words
	 */
	public List<FoundWord> matchAll(String text, int limit) {
		return matchAll(text, limit, false, false);
	}
@@ -185,20 +180,22 @@ public class WordTree extends HashMap<Character, WordTree> {
	 * @param isGreedMatch whether to use greedy matching (longest-match rule)
	 * @return the list of matched words
	 */
	public List<FoundWord> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
		if (null == text) {
			return null;
		}
		List<FoundWord> foundWords = new ArrayList<>();
		WordTree current = this;
		int length = text.length();
		final Filter<Character> charFilter = this.charFilter;
		// buffer for the characters found so far; added to foundWords once a complete word appears, otherwise cleared
		final StrBuilder wordBuffer = StrUtil.strBuilder();
		final StrBuilder keyBuffer = StrUtil.strBuilder();
		char currentChar;
		for (int i = 0; i < length; i++) {
			wordBuffer.reset();
			keyBuffer.reset();
			for (int j = i; j < length; j++) {
				currentChar = text.charAt(j);
//				Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
@@ -216,9 +213,10 @@ public class WordTree extends HashMap<Character, WordTree> {
					break;
				}
				wordBuffer.append(currentChar);
				keyBuffer.append(currentChar);
				if (current.isEnd(currentChar)) {
					// reached the end of a word, the keyword holds; continue searching from the position after this word
					foundWords.add(new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j));
					if (limit > 0 && foundWords.size() >= limit) {
						// match limit reached, return directly
						return foundWords;
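With the separate keyBuffer added here, each FoundWord distinguishes the keyword from the text that actually matched it. A sketch of what the new return type carries, based on the existing "tio"/"t-io" test case in DfaTest (the exact index values are an assumption derived from the matching loop):

	WordTree tree = new WordTree();
	tree.addWord("tio");
	List<FoundWord> found = tree.matchAll("AAAAAAAt-ioBBBBBBB");
	FoundWord fw = found.get(0);
	// fw.getWord()      -> "tio"  : the keyword, built from keyBuffer, filtered characters excluded
	// fw.getFoundWord() -> "t-io" : the text actually matched, built from wordBuffer, '-' included
	// fw.getStartIndex()/fw.getEndIndex() -> position of "t-io" within the input (7 and 10 here)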

View File

@@ -1,12 +1,12 @@
package cn.hutool.dfa.test;

import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.dfa.FoundWord;
import cn.hutool.dfa.WordTree;
import org.junit.Assert;
import org.junit.Test;

import java.util.List;
import java.util.stream.Collectors;

/**
 * DFA unit tests
@@ -28,8 +28,8 @@ public class DfaTest {
		// Case 1: standard matching, match the shortest keyword and skip keywords that were already matched
		// 大 is matched and matching stops there, so 大土豆 is not matched
		// 刚出锅 is matched and its three characters are skipped, so 出锅 is not matched (since 刚 is matched first, the longer word wins; the shortest-match rule only picks the shorter word when candidates share the same first character)
		List<FoundWord> matchAll = tree.matchAll(text, -1, false, false);
		Assert.assertEquals(CollectionUtil.newArrayList("大", "土^豆", "刚出锅"),
				matchAll.stream().map(FoundWord::getFoundWord).collect(Collectors.toList()));
	}

	/**
@@ -44,8 +44,8 @@ public class DfaTest {
		// Case 2: match the shortest keyword without skipping keywords that were already matched
		// 大 is matched; by the shortest-match rule 大土豆 is skipped, while 土豆 is still matched
		// 刚出锅 is matched; since already-matched words are not skipped, 出锅 is matched as well
		List<FoundWord> matchAll = tree.matchAll(text, -1, true, false);
		Assert.assertEquals(CollectionUtil.newArrayList("大", "土^豆", "刚出锅", "出锅"),
				matchAll.stream().map(FoundWord::getFoundWord).collect(Collectors.toList()));
	}

	/**
@@ -60,8 +60,8 @@ public class DfaTest {
		// Case 3: match the longest keyword and skip keywords that were already matched
		// 大 is matched; with longest matching, 大土豆 is then matched as well
		// since 大土豆 is matched, 土豆 is skipped; since 刚出锅 is matched, 出锅 is skipped
		List<FoundWord> matchAll = tree.matchAll(text, -1, false, true);
		Assert.assertEquals(CollectionUtil.newArrayList("大", "大土^豆", "刚出锅"),
				matchAll.stream().map(FoundWord::getFoundWord).collect(Collectors.toList()));
	}
@@ -77,8 +77,8 @@ public class DfaTest {
		// Case 4: match the longest keyword without skipping keywords that were already matched (the most complete set of keywords)
		// 大 is matched; with longest matching 大土豆 is then matched as well, and since already-matched keywords are not skipped, 土豆 is also matched
		// 刚出锅 is matched; since already-matched words are not skipped, 出锅 is matched as well
		List<FoundWord> matchAll = tree.matchAll(text, -1, true, true);
		Assert.assertEquals(CollectionUtil.newArrayList("大", "大土^豆", "土^豆", "刚出锅", "出锅"),
				matchAll.stream().map(FoundWord::getFoundWord).collect(Collectors.toList()));
	}
@@ -90,20 +90,21 @@ public class DfaTest {
		WordTree tree = new WordTree();
		tree.addWord("tio");
		List<FoundWord> all = tree.matchAll("AAAAAAAt-ioBBBBBBB");
		Assert.assertEquals(CollectionUtil.newArrayList("t-io"),
				all.stream().map(FoundWord::getFoundWord).collect(Collectors.toList()));
	}

	@Test
	public void aTest() {
		WordTree tree = new WordTree();
		tree.addWord("women");
		String text = "a WOMEN todo.".toLowerCase();
		List<FoundWord> matchAll = tree.matchAll(text, -1, false, false);
		Assert.assertEquals("[women]", matchAll.stream().map(FoundWord::getFoundWord).collect(Collectors.toList()).toString());
	}

	// ----------------------------------------------------------------------------------------------------------

	/**
	 * Build the word tree
	 *

View File

@@ -0,0 +1,49 @@
package cn.hutool.dfa.test;

import cn.hutool.dfa.SensitiveUtil;
import org.junit.Assert;
import org.junit.Test;

import java.util.ArrayList;
import java.util.List;

public class SensitiveUtilTest {

	@Test
	public void testSensitiveFilter() {
		List<String> wordList = new ArrayList<>();
		wordList.add("大");
		wordList.add("大土豆");
		wordList.add("土豆");
		wordList.add("刚出锅");
		wordList.add("出锅");

		TestBean bean = new TestBean();
		bean.setStr("我有一颗$大土^豆,刚出锅的");
		bean.setNum(100);
		SensitiveUtil.init(wordList);
		bean = SensitiveUtil.sensitiveFilter(bean, true, null);
		Assert.assertEquals("我有一颗$*******的", bean.getStr());
	}

	public static class TestBean {
		private String str;
		private Integer num;

		public String getStr() {
			return str;
		}

		public void setStr(String str) {
			this.str = str;
		}

		public Integer getNum() {
			return num;
		}

		public void setNum(Integer num) {
			this.num = num;
		}
	}
}