From 824aaa2618baa2ffc89634d3539a5729d7fd5494 Mon Sep 17 00:00:00 2001 From: haibinxiao Date: Sun, 6 Dec 2020 19:27:02 +0800 Subject: [PATCH 1/4] =?UTF-8?q?=E6=95=8F=E6=84=9F=E8=AF=8D=E8=BF=87?= =?UTF-8?q?=E6=BB=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../main/java/cn/hutool/dfa/FoundWord.java | 50 ++++++ .../cn/hutool/dfa/SensitiveProcessor.java | 23 +++ .../java/cn/hutool/dfa/SensitiveUtil.java | 145 +++++++++++++----- .../src/main/java/cn/hutool/dfa/WordTree.java | 24 ++- .../test/java/cn/hutool/dfa/test/DfaTest.java | 41 ++--- .../cn/hutool/dfa/test/SensitiveUtilTest.java | 49 ++++++ 6 files changed, 257 insertions(+), 75 deletions(-) create mode 100644 hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java create mode 100644 hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java create mode 100644 hutool-dfa/src/test/java/cn/hutool/dfa/test/SensitiveUtilTest.java diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java b/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java new file mode 100644 index 000000000..b24fc2232 --- /dev/null +++ b/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java @@ -0,0 +1,50 @@ +package cn.hutool.dfa; + +/** + * @author 肖海斌 + * @Date 2020-12-05 + *

+ * 匹配到的敏感词,包含敏感词,text中匹配敏感词的内容,以及匹配内容在text中的下标, + * 下标可以用来做敏感词的进一步处理,如果替换成** + */ +public class FoundWord { + /** + * 生效的敏感词 + */ + private String word; + /** + * 敏感词匹配到的内容 + */ + private String foundWord; + /** + * 匹配内容在待分析字符串中的开始位置 + */ + private int startIndex; + /** + * 匹配内容在待分析字符串中的结束位置 + */ + private int endIndex; + + public FoundWord(String word, String foundWord, int start, int end) { + this.word = word; + this.foundWord = foundWord; + this.startIndex = start; + this.endIndex = end; + } + + public String getWord() { + return word; + } + + public String getFoundWord() { + return foundWord; + } + + public int getStartIndex() { + return startIndex; + } + + public int getEndIndex() { + return endIndex; + } +} diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java new file mode 100644 index 000000000..e8a1e8509 --- /dev/null +++ b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java @@ -0,0 +1,23 @@ +package cn.hutool.dfa; + +/** + * @author 肖海斌 + * @Date 2020-12-05 + * 敏感词过滤处理器,默认按字符数替换成* + */ +public interface SensitiveProcessor { + + /** + * 敏感词过滤处理 + * @param foundWord 敏感词匹配到的内容 + * @return 敏感词过滤后的内容,默认按字符数替换成* + */ + default String process(FoundWord foundWord) { + int length = foundWord.getFoundWord().length(); + StringBuilder sb = new StringBuilder(length); + for (int i = 0; i < length; i++) { + sb.append("*"); + } + return sb.toString(); + } +} diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java index d64100494..67244a4ad 100644 --- a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java +++ b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java @@ -1,77 +1,84 @@ package cn.hutool.dfa; +import cn.hutool.core.collection.CollectionUtil; import cn.hutool.core.lang.Filter; import cn.hutool.core.thread.ThreadUtil; import cn.hutool.core.util.StrUtil; import cn.hutool.json.JSONUtil; import java.util.Collection; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** * 敏感词工具类 - * @author Looly * + * @author Looly */ public final class SensitiveUtil { public static final char DEFAULT_SEPARATOR = StrUtil.C_COMMA; private static final WordTree sensitiveTree = new WordTree(); - + /** * @return 是否已经被初始化 */ - public static boolean isInited(){ + public static boolean isInited() { return !sensitiveTree.isEmpty(); } - + /** * 初始化敏感词树 - * @param isAsync 是否异步初始化 + * + * @param isAsync 是否异步初始化 * @param sensitiveWords 敏感词列表 */ - public static void init(final Collection sensitiveWords, boolean isAsync){ - if(isAsync){ + public static void init(final Collection sensitiveWords, boolean isAsync) { + if (isAsync) { ThreadUtil.execAsync(() -> { init(sensitiveWords); return true; }); - }else{ + } else { init(sensitiveWords); } } - + /** * 初始化敏感词树 + * * @param sensitiveWords 敏感词列表 */ - public static void init(Collection sensitiveWords){ + public static void init(Collection sensitiveWords) { sensitiveTree.clear(); sensitiveTree.addWords(sensitiveWords); // log.debug("Sensitive init finished, sensitives: {}", sensitiveWords); } - + /** * 初始化敏感词树 + * * @param sensitiveWords 敏感词列表组成的字符串 - * @param isAsync 是否异步初始化 - * @param separator 分隔符 + * @param isAsync 是否异步初始化 + * @param separator 分隔符 */ - public static void init(String sensitiveWords, char separator, boolean isAsync){ - if(StrUtil.isNotBlank(sensitiveWords)){ + public static void init(String sensitiveWords, char separator, boolean isAsync) { + if (StrUtil.isNotBlank(sensitiveWords)) { init(StrUtil.split(sensitiveWords, separator), isAsync); } } - + /** * 初始化敏感词树,使用逗号分隔每个单词 + * * @param sensitiveWords 敏感词列表组成的字符串 - * @param isAsync 是否异步初始化 + * @param isAsync 是否异步初始化 */ - public static void init(String sensitiveWords, boolean isAsync){ + public static void init(String sensitiveWords, boolean isAsync) { init(sensitiveWords, DEFAULT_SEPARATOR, isAsync); } - + /** * 设置字符过滤规则,通过定义字符串过滤规则,过滤不需要的字符
* 当accept为false时,此字符不参与匹配 @@ -80,90 +87,144 @@ public final class SensitiveUtil { * @since 5.4.4 */ public static void setCharFilter(Filter charFilter) { - if(charFilter != null) { + if (charFilter != null) { sensitiveTree.setCharFilter(charFilter); } } - + /** * 是否包含敏感词 + * * @param text 文本 * @return 是否包含 */ - public static boolean containsSensitive(String text){ + public static boolean containsSensitive(String text) { return sensitiveTree.isMatch(text); } - + /** * 是否包含敏感词 + * * @param obj bean,会被转为JSON字符串 * @return 是否包含 */ - public static boolean containsSensitive(Object obj){ + public static boolean containsSensitive(Object obj) { return sensitiveTree.isMatch(JSONUtil.toJsonStr(obj)); } - + /** * 查找敏感词,返回找到的第一个敏感词 + * * @param text 文本 * @return 敏感词 */ - public static String getFindedFirstSensitive(String text){ + public static FoundWord getFindedFirstSensitive(String text) { return sensitiveTree.match(text); } - + /** * 查找敏感词,返回找到的第一个敏感词 + * * @param obj bean,会被转为JSON字符串 * @return 敏感词 */ - public static String getFindedFirstSensitive(Object obj){ + public static FoundWord getFindedFirstSensitive(Object obj) { return sensitiveTree.match(JSONUtil.toJsonStr(obj)); } - + /** * 查找敏感词,返回找到的所有敏感词 + * * @param text 文本 * @return 敏感词 */ - public static List getFindedAllSensitive(String text){ + public static List getFindedAllSensitive(String text) { return sensitiveTree.matchAll(text); } - + /** * 查找敏感词,返回找到的所有敏感词
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab] - * - * @param text 文本 + * + * @param text 文本 * @param isDensityMatch 是否使用密集匹配原则 - * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则 + * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则 * @return 敏感词 */ - public static List getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch){ + public static List getFindedAllSensitive(String text, boolean isDensityMatch, boolean isGreedMatch) { return sensitiveTree.matchAll(text, -1, isDensityMatch, isGreedMatch); } - + /** * 查找敏感词,返回找到的所有敏感词 + * * @param bean 对象,会被转为JSON * @return 敏感词 */ - public static List getFindedAllSensitive(Object bean){ + public static List getFindedAllSensitive(Object bean) { return sensitiveTree.matchAll(JSONUtil.toJsonStr(bean)); } - + /** * 查找敏感词,返回找到的所有敏感词
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab] - * - * @param bean 对象,会被转为JSON + * + * @param bean 对象,会被转为JSON * @param isDensityMatch 是否使用密集匹配原则 - * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则 + * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则 * @return 敏感词 */ - public static List getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch){ + public static List getFindedAllSensitive(Object bean, boolean isDensityMatch, boolean isGreedMatch) { return getFindedAllSensitive(JSONUtil.toJsonStr(bean), isDensityMatch, isGreedMatch); } + + /** + * 敏感词过滤 + * + * @param bean 对象,会被转为JSON + * @param isGreedMatch 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab] + * @param sensitiveProcessor 敏感词处理器,默认按匹配内容的字符数替换成* + * @param bean的class类型 + * @return 敏感词过滤处理后的bean对象 + */ + public static T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) { + sensitiveProcessor = sensitiveProcessor == null ? new SensitiveProcessor() { + } : sensitiveProcessor; + String jsonText = JSONUtil.toJsonStr(bean); + Class c = (Class) bean.getClass(); + return JSONUtil.toBean(sensitiveFilter(jsonText, isGreedMatch, sensitiveProcessor), c); + } + + /** + * @param text 文本 + * @param isGreedMatch 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab] + * @param sensitiveProcessor 敏感词处理器,默认按匹配内容的字符数替换成* + * @return 敏感词过滤处理后的文本 + */ + public static String sensitiveFilter(String text, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) { + if (null == text || text.trim().equals("")) { + return text; + } + //敏感词过滤场景下,不需要密集匹配 + List foundWordList = getFindedAllSensitive(text, false, isGreedMatch); + if (CollectionUtil.isEmpty(foundWordList)) { + return text; + } + Map foundWordMap = new HashMap<>(foundWordList.size()); + foundWordList.forEach(foundWord -> foundWordMap.put(foundWord.getStartIndex(), foundWord)); + int length = text.length(); + StringBuilder textStringBuilder = new StringBuilder(); + for (int i = 0; i < length; i++) { + FoundWord fw = foundWordMap.get(i); + if (fw != null) { + textStringBuilder.append(sensitiveProcessor.process(fw)); + i = fw.getEndIndex(); + } else { + textStringBuilder.append(text.charAt(i)); + } + } + return textStringBuilder.toString(); + } } diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java index 0d715b338..4e05657b9 100644 --- a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java +++ b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java @@ -5,12 +5,7 @@ import cn.hutool.core.lang.Filter; import cn.hutool.core.text.StrBuilder; import cn.hutool.core.util.StrUtil; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; /** * DFA(Deterministic Finite Automaton 确定有穷自动机) @@ -140,11 +135,11 @@ public class WordTree extends HashMap { * @param text 被检查的文本 * @return 匹配到的关键字 */ - public String match(String text) { + public FoundWord match(String text) { if (null == text) { return null; } - List matchAll = matchAll(text, 1); + List matchAll = matchAll(text, 1); if (CollectionUtil.isNotEmpty(matchAll)) { return matchAll.get(0); } @@ -159,7 +154,7 @@ public class WordTree extends HashMap { * @param text 被检查的文本 * @return 匹配的词列表 */ - public List matchAll(String text) { + public List matchAll(String text) { return matchAll(text, -1); } @@ -170,7 +165,7 @@ public class WordTree extends HashMap { * @param limit 限制匹配个数 * @return 匹配的词列表 */ - public List matchAll(String text, int limit) { + public List matchAll(String text, int limit) { return matchAll(text, limit, false, false); } @@ -185,20 +180,22 @@ public class WordTree extends HashMap { * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则 * @return 匹配的词列表 */ - public List matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) { + public List matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) { if (null == text) { return null; } - List foundWords = new ArrayList<>(); + List foundWords = new ArrayList<>(); WordTree current = this; int length = text.length(); final Filter charFilter = this.charFilter; //存放查找到的字符缓存。完整出现一个词时加到findedWords中,否则清空 final StrBuilder wordBuffer = StrUtil.strBuilder(); + final StrBuilder keyBuffer = StrUtil.strBuilder(); char currentChar; for (int i = 0; i < length; i++) { wordBuffer.reset(); + keyBuffer.reset(); for (int j = i; j < length; j++) { currentChar = text.charAt(j); // Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar); @@ -216,9 +213,10 @@ public class WordTree extends HashMap { break; } wordBuffer.append(currentChar); + keyBuffer.append(currentChar); if (current.isEnd(currentChar)) { //到达单词末尾,关键词成立,从此词的下一个位置开始查找 - foundWords.add(wordBuffer.toString()); + foundWords.add(new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j)); if (limit > 0 && foundWords.size() >= limit) { //超过匹配限制个数,直接返回 return foundWords; diff --git a/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java b/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java index 9d500f870..065b3d0da 100644 --- a/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java +++ b/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java @@ -1,16 +1,16 @@ package cn.hutool.dfa.test; -import java.util.List; - +import cn.hutool.core.collection.CollectionUtil; +import cn.hutool.dfa.FoundWord; +import cn.hutool.dfa.WordTree; import org.junit.Assert; import org.junit.Test; -import cn.hutool.core.collection.CollectionUtil; -import cn.hutool.dfa.WordTree; +import java.util.List; /** * DFA单元测试 - * + * * @author Looly * */ @@ -28,8 +28,8 @@ public class DfaTest { // 情况一:标准匹配,匹配到最短关键词,并跳过已经匹配的关键词 // 匹配到【大】,就不再继续匹配了,因此【大土豆】不匹配 // 匹配到【刚出锅】,就跳过这三个字了,因此【出锅】不匹配(由于刚首先被匹配,因此长的被匹配,最短匹配只针对第一个字相同选最短) - List matchAll = tree.matchAll(text, -1, false, false); - Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土^豆", "刚出锅")); + List matchAll = tree.matchAll(text, -1, false, false); + Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()), CollectionUtil.newArrayList("大", "土^豆", "刚出锅")); } /** @@ -44,8 +44,8 @@ public class DfaTest { // 情况二:匹配到最短关键词,不跳过已经匹配的关键词 // 【大】被匹配,最短匹配原则【大土豆】被跳过,【土豆继续被匹配】 // 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配 - List matchAll = tree.matchAll(text, -1, true, false); - Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土^豆", "刚出锅", "出锅")); + List matchAll = tree.matchAll(text, -1, true, false); + Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()), CollectionUtil.newArrayList("大", "土^豆", "刚出锅", "出锅")); } /** @@ -60,8 +60,8 @@ public class DfaTest { // 情况三:匹配到最长关键词,跳过已经匹配的关键词 // 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配 // 由于【大土豆】被匹配,【土豆】被跳过,由于【刚出锅】被匹配,【出锅】被跳过 - List matchAll = tree.matchAll(text, -1, false, true); - Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土^豆", "刚出锅")); + List matchAll = tree.matchAll(text, -1, false, true); + Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()), CollectionUtil.newArrayList("大", "大土^豆", "刚出锅")); } @@ -77,8 +77,8 @@ public class DfaTest { // 情况四:匹配到最长关键词,不跳过已经匹配的关键词(最全关键词) // 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配,由于不跳过已经匹配的关键词,土豆继续被匹配 // 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配 - List matchAll = tree.matchAll(text, -1, true, true); - Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土^豆", "土^豆", "刚出锅", "出锅")); + List matchAll = tree.matchAll(text, -1, true, true); + Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()), CollectionUtil.newArrayList("大", "大土^豆", "土^豆", "刚出锅", "出锅")); } @@ -90,23 +90,24 @@ public class DfaTest { WordTree tree = new WordTree(); tree.addWord("tio"); - List all = tree.matchAll("AAAAAAAt-ioBBBBBBB"); - Assert.assertEquals(all, CollectionUtil.newArrayList("t-io")); + List all = tree.matchAll("AAAAAAAt-ioBBBBBBB"); + Assert.assertEquals(all.stream().map(fw -> fw.getFoundWord()), CollectionUtil.newArrayList("t-io")); } @Test - public void aTest(){ + public void aTest() { WordTree tree = new WordTree(); tree.addWord("women"); String text = "a WOMEN todo.".toLowerCase(); - List matchAll = tree.matchAll(text, -1, false, false); - Assert.assertEquals("[women]", matchAll.toString()); + List matchAll = tree.matchAll(text, -1, false, false); + Assert.assertEquals("[women]", matchAll.stream().map(fw -> fw.getFoundWord()).toString()); } - + // ---------------------------------------------------------------------------------------------------------- + /** * 构建查找树 - * + * * @return 查找树 */ private WordTree buildWordTree() { diff --git a/hutool-dfa/src/test/java/cn/hutool/dfa/test/SensitiveUtilTest.java b/hutool-dfa/src/test/java/cn/hutool/dfa/test/SensitiveUtilTest.java new file mode 100644 index 000000000..8a19fef3e --- /dev/null +++ b/hutool-dfa/src/test/java/cn/hutool/dfa/test/SensitiveUtilTest.java @@ -0,0 +1,49 @@ +package cn.hutool.dfa.test; + +import cn.hutool.dfa.SensitiveUtil; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +public class SensitiveUtilTest { + + @Test + public void testSensitiveFilter() { + List wordList = new ArrayList<>(); + wordList.add("大"); + wordList.add("大土豆"); + wordList.add("土豆"); + wordList.add("刚出锅"); + wordList.add("出锅"); + TestBean bean = new TestBean(); + bean.setStr("我有一颗$大土^豆,刚出锅的"); + bean.setNum(100); + SensitiveUtil.init(wordList); + bean = SensitiveUtil.sensitiveFilter(bean, true, null); + Assert.assertEquals(bean.getStr(), "我有一颗$****,***的"); + } + + public static class TestBean { + private String str; + private Integer num; + + public String getStr() { + return str; + } + + public void setStr(String str) { + this.str = str; + } + + public Integer getNum() { + return num; + } + + public void setNum(Integer num) { + this.num = num; + } + } + +} From f7c640934d6bf4c91fc11d0291f82f8eb3937f2c Mon Sep 17 00:00:00 2001 From: haibinxiao Date: Sun, 6 Dec 2020 21:29:21 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java | 1 - hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java | 1 - 2 files changed, 2 deletions(-) diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java b/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java index b24fc2232..e57a0f9ec 100644 --- a/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java +++ b/hutool-dfa/src/main/java/cn/hutool/dfa/FoundWord.java @@ -2,7 +2,6 @@ package cn.hutool.dfa; /** * @author 肖海斌 - * @Date 2020-12-05 *

* 匹配到的敏感词,包含敏感词,text中匹配敏感词的内容,以及匹配内容在text中的下标, * 下标可以用来做敏感词的进一步处理,如果替换成** diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java index e8a1e8509..34c0128b8 100644 --- a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java +++ b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveProcessor.java @@ -2,7 +2,6 @@ package cn.hutool.dfa; /** * @author 肖海斌 - * @Date 2020-12-05 * 敏感词过滤处理器,默认按字符数替换成* */ public interface SensitiveProcessor { From f5c53a8f60c7f51866ab47601bd1cdcdb05822f7 Mon Sep 17 00:00:00 2001 From: haibinxiao Date: Sun, 6 Dec 2020 22:00:12 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E9=94=99=E8=AF=AF=E8=A7=A3=E5=86=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/test/java/cn/hutool/dfa/test/DfaTest.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java b/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java index 065b3d0da..913f10fce 100644 --- a/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java +++ b/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java @@ -7,6 +7,7 @@ import org.junit.Assert; import org.junit.Test; import java.util.List; +import java.util.stream.Collectors; /** * DFA单元测试 @@ -29,7 +30,7 @@ public class DfaTest { // 匹配到【大】,就不再继续匹配了,因此【大土豆】不匹配 // 匹配到【刚出锅】,就跳过这三个字了,因此【出锅】不匹配(由于刚首先被匹配,因此长的被匹配,最短匹配只针对第一个字相同选最短) List matchAll = tree.matchAll(text, -1, false, false); - Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()), CollectionUtil.newArrayList("大", "土^豆", "刚出锅")); + Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()), CollectionUtil.newArrayList("大", "土^豆", "刚出锅")); } /** @@ -45,7 +46,7 @@ public class DfaTest { // 【大】被匹配,最短匹配原则【大土豆】被跳过,【土豆继续被匹配】 // 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配 List matchAll = tree.matchAll(text, -1, true, false); - Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()), CollectionUtil.newArrayList("大", "土^豆", "刚出锅", "出锅")); + Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()), CollectionUtil.newArrayList("大", "土^豆", "刚出锅", "出锅")); } /** @@ -61,7 +62,7 @@ public class DfaTest { // 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配 // 由于【大土豆】被匹配,【土豆】被跳过,由于【刚出锅】被匹配,【出锅】被跳过 List matchAll = tree.matchAll(text, -1, false, true); - Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()), CollectionUtil.newArrayList("大", "大土^豆", "刚出锅")); + Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()), CollectionUtil.newArrayList("大", "大土^豆", "刚出锅")); } @@ -78,7 +79,7 @@ public class DfaTest { // 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配,由于不跳过已经匹配的关键词,土豆继续被匹配 // 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配 List matchAll = tree.matchAll(text, -1, true, true); - Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()), CollectionUtil.newArrayList("大", "大土^豆", "土^豆", "刚出锅", "出锅")); + Assert.assertEquals(matchAll.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()), CollectionUtil.newArrayList("大", "大土^豆", "土^豆", "刚出锅", "出锅")); } @@ -91,7 +92,7 @@ public class DfaTest { tree.addWord("tio"); List all = tree.matchAll("AAAAAAAt-ioBBBBBBB"); - Assert.assertEquals(all.stream().map(fw -> fw.getFoundWord()), CollectionUtil.newArrayList("t-io")); + Assert.assertEquals(all.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()), CollectionUtil.newArrayList("t-io")); } @Test @@ -100,7 +101,7 @@ public class DfaTest { tree.addWord("women"); String text = "a WOMEN todo.".toLowerCase(); List matchAll = tree.matchAll(text, -1, false, false); - Assert.assertEquals("[women]", matchAll.stream().map(fw -> fw.getFoundWord()).toString()); + Assert.assertEquals("[women]", matchAll.stream().map(fw -> fw.getFoundWord()).collect(Collectors.toList()).toString()); } // ---------------------------------------------------------------------------------------------------------- From 11edc1fcc666537aed6f818405ff4ce5425e428c Mon Sep 17 00:00:00 2001 From: haibinxiao Date: Sun, 6 Dec 2020 23:02:11 +0800 Subject: [PATCH 4/4] =?UTF-8?q?sensitiveProcessor=E8=BF=81=E7=A7=BB?= =?UTF-8?q?=E5=88=B0=E6=AD=A3=E7=A1=AE=E7=9A=84=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java index 67244a4ad..6a396d49d 100644 --- a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java +++ b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java @@ -190,8 +190,6 @@ public final class SensitiveUtil { * @return 敏感词过滤处理后的bean对象 */ public static T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) { - sensitiveProcessor = sensitiveProcessor == null ? new SensitiveProcessor() { - } : sensitiveProcessor; String jsonText = JSONUtil.toJsonStr(bean); Class c = (Class) bean.getClass(); return JSONUtil.toBean(sensitiveFilter(jsonText, isGreedMatch, sensitiveProcessor), c); @@ -212,6 +210,8 @@ public final class SensitiveUtil { if (CollectionUtil.isEmpty(foundWordList)) { return text; } + sensitiveProcessor = sensitiveProcessor == null ? new SensitiveProcessor() { + } : sensitiveProcessor; Map foundWordMap = new HashMap<>(foundWordList.size()); foundWordList.forEach(foundWord -> foundWordMap.put(foundWord.getStartIndex(), foundWord)); int length = text.length();