From d5916b9998015346cbddb80c52de90fdfe5cad6c Mon Sep 17 00:00:00 2001 From: Looly Date: Mon, 7 Feb 2022 19:04:37 +0800 Subject: [PATCH] fix bug --- CHANGELOG.md | 1 + hutool-dfa/pom.xml | 5 --- .../java/cn/hutool/dfa/SensitiveUtil.java | 19 ++++++++--- .../src/main/java/cn/hutool/dfa/WordTree.java | 12 +++---- .../src/test/java/cn/hutool/dfa/DfaTest.java | 33 ++++++++++++++++--- .../java/cn/hutool/dfa/SensitiveUtilTest.java | 26 ++++++--------- 6 files changed, 60 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ce31bdb6..6686d2456 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ * 【core 】 修复ChineseDate农历获取正月出现数组越界BUG(issue#2112@Github) * 【extra 】 修复EmojiUtil.toHtmlHex()方法(pr#519@Gitee) * 【system 】 修复CpuInfo.getUsed()方法(issue#2116@Github) +* 【dfa 】 修复密集匹配和贪婪匹配冲突问题(issue#2126@Github) ------------------------------------------------------------------------------------------------------------- # 5.7.20 (2022-01-20) diff --git a/hutool-dfa/pom.xml b/hutool-dfa/pom.xml index 6416c6f74..3e05431b2 100644 --- a/hutool-dfa/pom.xml +++ b/hutool-dfa/pom.xml @@ -17,11 +17,6 @@ Hutool 基于DFA的关键词查找 - - cn.hutool - hutool-core - ${project.parent.version} - cn.hutool hutool-json diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java index 0c2584690..bcf81a784 100644 --- a/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java +++ b/hutool-dfa/src/main/java/cn/hutool/dfa/SensitiveUtil.java @@ -195,11 +195,21 @@ public final class SensitiveUtil { */ public static T sensitiveFilter(T bean, boolean isGreedMatch, SensitiveProcessor sensitiveProcessor) { String jsonText = JSONUtil.toJsonStr(bean); - @SuppressWarnings("unchecked") - final Class c = (Class) bean.getClass(); + @SuppressWarnings("unchecked") final Class c = (Class) bean.getClass(); return JSONUtil.toBean(sensitiveFilter(jsonText, isGreedMatch, sensitiveProcessor), c); } + /** + * 处理过滤文本中的敏感词,默认替换成* + * + * @param text 文本 + * @return 敏感词过滤处理后的文本 + * @since 5.7.21 + */ + public static String sensitiveFilter(String text) { + return sensitiveFilter(text, true, null); + } + /** * 处理过滤文本中的敏感词,默认替换成* * @@ -214,13 +224,14 @@ public final class SensitiveUtil { } //敏感词过滤场景下,不需要密集匹配 - List foundWordList = getFoundAllSensitive(text, false, isGreedMatch); + List foundWordList = getFoundAllSensitive(text, true, isGreedMatch); if (CollUtil.isEmpty(foundWordList)) { return text; } sensitiveProcessor = sensitiveProcessor == null ? new SensitiveProcessor() { } : sensitiveProcessor; - Map foundWordMap = new HashMap<>(foundWordList.size()); + + final Map foundWordMap = new HashMap<>(foundWordList.size(), 1); foundWordList.forEach(foundWord -> foundWordMap.put(foundWord.getStartIndex(), foundWord)); int length = text.length(); StringBuilder textStringBuilder = new StringBuilder(); diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java index 310f5958b..a58371707 100644 --- a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java +++ b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java @@ -3,7 +3,6 @@ package cn.hutool.dfa; import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.CollectionUtil; import cn.hutool.core.lang.Filter; -import cn.hutool.core.text.StrBuilder; import cn.hutool.core.util.StrUtil; import java.util.ArrayList; @@ -247,15 +246,15 @@ public class WordTree extends HashMap { List foundWords = new ArrayList<>(); WordTree current = this; - int length = text.length(); + final int length = text.length(); final Filter charFilter = this.charFilter; //存放查找到的字符缓存。完整出现一个词时加到findedWords中,否则清空 - final StrBuilder wordBuffer = StrUtil.strBuilder(); - final StrBuilder keyBuffer = StrUtil.strBuilder(); + final StringBuilder wordBuffer = StrUtil.builder(); + final StringBuilder keyBuffer = StrUtil.builder(); char currentChar; for (int i = 0; i < length; i++) { - wordBuffer.reset(); - keyBuffer.reset(); + wordBuffer.setLength(0); + keyBuffer.setLength(0); for (int j = i; j < length; j++) { currentChar = text.charAt(j); // Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar); @@ -284,6 +283,7 @@ public class WordTree extends HashMap { if (false == isDensityMatch) { //如果非密度匹配,跳过匹配到的词 i = j; + break; } if (false == isGreedMatch) { //如果懒惰匹配(非贪婪匹配)。当遇到第一个结尾标记就结束本轮匹配 diff --git a/hutool-dfa/src/test/java/cn/hutool/dfa/DfaTest.java b/hutool-dfa/src/test/java/cn/hutool/dfa/DfaTest.java index d466f69b7..f185fac31 100644 --- a/hutool-dfa/src/test/java/cn/hutool/dfa/DfaTest.java +++ b/hutool-dfa/src/test/java/cn/hutool/dfa/DfaTest.java @@ -47,7 +47,7 @@ public class DfaTest { } /** - * 贪婪匹配原则测试 + * 贪婪非密集匹配原则测试 */ @Test public void greedMatchTest() { @@ -56,15 +56,15 @@ public class DfaTest { // ----------------------------------------------------------------------------------------------------------------------------------- // 情况三:匹配到最长关键词,跳过已经匹配的关键词 - // 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配 - // 由于【大土豆】被匹配,【土豆】被跳过,由于【刚出锅】被匹配,【出锅】被跳过 + // 匹配到【大】,由于非密集匹配,因此从下一个字符开始查找,匹配到【土豆】接着被匹配 + // 由于【刚出锅】被匹配,由于非密集匹配,【出锅】被跳过 List matchAll = tree.matchAll(text, -1, false, true); - Assert.assertEquals(matchAll, CollUtil.newArrayList("大", "大土^豆", "刚出锅")); + Assert.assertEquals(matchAll, CollUtil.newArrayList("大", "土^豆", "刚出锅")); } /** - * 密集匹配原则(最短匹配)和贪婪匹配原则测试 + * 密集匹配原则(最长匹配)和贪婪匹配原则测试 */ @Test public void densityAndGreedMatchTest() { @@ -80,6 +80,29 @@ public class DfaTest { } + @Test + public void densityAndGreedMatchTest2(){ + WordTree tree = new WordTree(); + tree.addWord("赵"); + tree.addWord("赵阿"); + tree.addWord("赵阿三"); + + final List result = tree.matchAllWords("赵阿三在做什么", -1, true, true); + Assert.assertEquals(3, result.size()); + + Assert.assertEquals("赵", result.get(0).getWord()); + Assert.assertEquals(0, result.get(0).getStartIndex().intValue()); + Assert.assertEquals(0, result.get(0).getEndIndex().intValue()); + + Assert.assertEquals("赵阿", result.get(1).getWord()); + Assert.assertEquals(0, result.get(1).getStartIndex().intValue()); + Assert.assertEquals(1, result.get(1).getEndIndex().intValue()); + + Assert.assertEquals("赵阿三", result.get(2).getWord()); + Assert.assertEquals(0, result.get(2).getStartIndex().intValue()); + Assert.assertEquals(2, result.get(2).getEndIndex().intValue()); + } + /** * 停顿词测试 */ diff --git a/hutool-dfa/src/test/java/cn/hutool/dfa/SensitiveUtilTest.java b/hutool-dfa/src/test/java/cn/hutool/dfa/SensitiveUtilTest.java index ba7348b09..5bbbe9f23 100644 --- a/hutool-dfa/src/test/java/cn/hutool/dfa/SensitiveUtilTest.java +++ b/hutool-dfa/src/test/java/cn/hutool/dfa/SensitiveUtilTest.java @@ -1,5 +1,7 @@ package cn.hutool.dfa; +import cn.hutool.core.collection.ListUtil; +import lombok.Data; import org.junit.Assert; import org.junit.Test; @@ -24,25 +26,17 @@ public class SensitiveUtilTest { Assert.assertEquals(bean.getStr(), "我有一颗$****,***的"); } + @Data public static class TestBean { private String str; private Integer num; - - public String getStr() { - return str; - } - - public void setStr(String str) { - this.str = str; - } - - public Integer getNum() { - return num; - } - - public void setNum(Integer num) { - this.num = num; - } } + @Test + public void issue2126(){ + SensitiveUtil.init(ListUtil.of("赵", "赵阿", "赵阿三")); + + String result = SensitiveUtil.sensitiveFilter("赵阿三在做什么。", true, null); + Assert.assertEquals("***在做什么。", result); + } }