From 2e2d43d764870f261b0e9b365563af6dbac6d676 Mon Sep 17 00:00:00 2001 From: Looly Date: Tue, 3 Mar 2020 11:18:55 +0800 Subject: [PATCH] add filter support --- CHANGELOG.md | 1 + .../main/java/cn/hutool/core/lang/Filter.java | 3 +- .../cn/hutool/crypto/test/BCUtilTest.java | 3 + .../src/main/java/cn/hutool/dfa/WordTree.java | 140 +++++++++++------- .../test/java/cn/hutool/dfa/test/DfaTest.java | 12 +- 5 files changed, 99 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b990978c1..ad989f685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ * 【crypto】 RSA算法中,BlockSize长度策略调整(issue#721@Github) * 【crypto】 删除SM2Engine,使用BC库中的对象替代 * 【crypto】 增加PemUtil工具类 +* 【dfa 】 WordTree增加Filter,支持自定义特殊字符过滤器 ### Bug修复 diff --git a/hutool-core/src/main/java/cn/hutool/core/lang/Filter.java b/hutool-core/src/main/java/cn/hutool/core/lang/Filter.java index 0443da1e2..dce1b8a15 100644 --- a/hutool-core/src/main/java/cn/hutool/core/lang/Filter.java +++ b/hutool-core/src/main/java/cn/hutool/core/lang/Filter.java @@ -2,13 +2,14 @@ package cn.hutool.core.lang; /** * 过滤器接口 - * @author Looly * + * @author Looly */ @FunctionalInterface public interface Filter { /** * 是否接受对象 + * * @param t 检查的对象 * @return 是否接受对象 */ diff --git a/hutool-crypto/src/test/java/cn/hutool/crypto/test/BCUtilTest.java b/hutool-crypto/src/test/java/cn/hutool/crypto/test/BCUtilTest.java index b60ee9641..67989592a 100644 --- a/hutool-crypto/src/test/java/cn/hutool/crypto/test/BCUtilTest.java +++ b/hutool-crypto/src/test/java/cn/hutool/crypto/test/BCUtilTest.java @@ -9,6 +9,9 @@ import org.junit.Test; public class BCUtilTest { + /** + * 密钥生成来自:https://i.goto327.top/CryptTools/SM2.aspx?tdsourcetag=s_pctim_aiomsg + */ @Test public void createECPublicKeyParametersTest() { String x = "706AD9DAA3E5CEAC3DA59F583429E8043BAFC576BE10092C4EA4D8E19846CA62"; diff --git a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java index 87cb0af32..69f76e393 100644 --- a/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java +++ b/hutool-dfa/src/main/java/cn/hutool/dfa/WordTree.java @@ -1,4 +1,5 @@ package cn.hutool.dfa; + import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -7,6 +8,7 @@ import java.util.List; import java.util.Set; import cn.hutool.core.collection.CollectionUtil; +import cn.hutool.core.lang.Filter; import cn.hutool.core.text.StrBuilder; import cn.hutool.core.util.StrUtil; @@ -16,72 +18,94 @@ import cn.hutool.core.util.StrUtil; * 单词树使用group区分不同的关键字集合,不同的分组可以共享树枝,避免重复建树。
* 单词树使用树状结构表示一组单词。
* 例如:红领巾,红河构建树后为:
- * 红
- * / \
- * 领 河
- * /
- * 巾
- *其中每个节点都是一个WordTree对象,查找时从上向下查找。
- * @author Looly + * 红
+ * / \
+ * 领 河
+ * /
+ * 巾
+ * 其中每个节点都是一个WordTree对象,查找时从上向下查找。
* + * @author Looly */ -public class WordTree extends HashMap{ +public class WordTree extends HashMap { private static final long serialVersionUID = -4646423269465809276L; - + /** * 敏感词字符末尾标识,用于标识单词末尾字符 */ private Set endCharacterSet = new HashSet<>(); - + /** + * 字符过滤规则,通过定义字符串过滤规则,过滤不需要的字符,当accept为false时,此字符不参与匹配 + */ + private Filter charFilter = StopChar::isNotStopChar; + //--------------------------------------------------------------------------------------- Constructor start + /** * 默认构造 */ public WordTree() { } //--------------------------------------------------------------------------------------- Constructor start - + + /** + * 设置字符过滤规则,通过定义字符串过滤规则,过滤不需要的字符
+ * 当accept为false时,此字符不参与匹配 + * + * @param charFilter 过滤函数 + * @return this + * @since 5.2.0 + */ + public WordTree setCharFilter(Filter charFilter) { + this.charFilter = charFilter; + return this; + } + //------------------------------------------------------------------------------- add word - + /** * 增加一组单词 + * * @param words 单词集合 */ - public void addWords(Collection words){ - if(false == (words instanceof Set)){ + public void addWords(Collection words) { + if (false == (words instanceof Set)) { words = new HashSet<>(words); } for (String word : words) { addWord(word); } } - + /** * 增加一组单词 + * * @param words 单词数组 */ - public void addWords(String... words){ + public void addWords(String... words) { HashSet wordsSet = CollectionUtil.newHashSet(words); for (String word : wordsSet) { addWord(word); } } - + /** * 添加单词,使用默认类型 + * * @param word 单词 */ public void addWord(String word) { + final Filter charFilter = this.charFilter; WordTree parent = null; WordTree current = this; WordTree child; char currentChar = 0; int length = word.length(); - for(int i = 0; i < length; i++){ + for (int i = 0; i < length; i++) { currentChar = word.charAt(i); - if(false == StopChar.isStopChar(currentChar)){//只处理合法字符 + if (charFilter.accept(currentChar)) {//只处理合法字符 child = current.get(currentChar); - if(child == null){ + if (child == null) { //无子类,新建一个子节点后存放下一个字符 child = new WordTree(); current.put(currentChar, child); @@ -90,79 +114,86 @@ public class WordTree extends HashMap{ current = child; } } - if(null != parent){ + if (null != parent) { parent.setEnd(currentChar); } } - + //------------------------------------------------------------------------------- match + /** * 指定文本是否包含树中的词 + * * @param text 被检查的文本 * @return 是否包含 */ - public boolean isMatch(String text){ - if(null == text){ + public boolean isMatch(String text) { + if (null == text) { return false; } return null != match(text); } - + /** * 获得第一个匹配的关键字 + * * @param text 被检查的文本 * @return 匹配到的关键字 */ - public String match(String text){ - if(null == text){ + public String match(String text) { + if (null == text) { return null; } List matchAll = matchAll(text, 1); - if(CollectionUtil.isNotEmpty(matchAll)){ + if (CollectionUtil.isNotEmpty(matchAll)) { return matchAll.get(0); } return null; } - + //------------------------------------------------------------------------------- match all + /** * 找出所有匹配的关键字 + * * @param text 被检查的文本 * @return 匹配的词列表 */ public List matchAll(String text) { return matchAll(text, -1); } - + /** * 找出所有匹配的关键字 - * @param text 被检查的文本 + * + * @param text 被检查的文本 * @param limit 限制匹配个数 * @return 匹配的词列表 */ public List matchAll(String text, int limit) { return matchAll(text, limit, false, false); } - + /** * 找出所有匹配的关键字
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab] - * - * @param text 被检查的文本 - * @param limit 限制匹配个数 + * + * @param text 被检查的文本 + * @param limit 限制匹配个数 * @param isDensityMatch 是否使用密集匹配原则 - * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则 + * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则 * @return 匹配的词列表 */ public List matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) { - if(null == text){ + if (null == text) { return null; } - + List foundWords = new ArrayList<>(); WordTree current = this; int length = text.length(); + final Filter charFilter = this.charFilter; //存放查找到的字符缓存。完整出现一个词时加到findedWords中,否则清空 final StrBuilder wordBuffer = StrUtil.strBuilder(); char currentChar; @@ -171,38 +202,38 @@ public class WordTree extends HashMap{ for (int j = i; j < length; j++) { currentChar = text.charAt(j); // Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar); - if(StopChar.isStopChar(currentChar)){ - if(wordBuffer.length() > 0){ + if (false == charFilter.accept(currentChar)) { + if (wordBuffer.length() > 0) { //做为关键词中间的停顿词被当作关键词的一部分被返回 wordBuffer.append(currentChar); - }else{ + } else { //停顿词做为关键词的第一个字符时需要跳过 i++; } continue; - }else if(false == current.containsKey(currentChar)){ + } else if (false == current.containsKey(currentChar)) { //非关键字符被整体略过,重新以下个字符开始检查 break; } wordBuffer.append(currentChar); - if(current.isEnd(currentChar)){ + if (current.isEnd(currentChar)) { //到达单词末尾,关键词成立,从此词的下一个位置开始查找 foundWords.add(wordBuffer.toString()); - if(limit > 0 && foundWords.size() >= limit){ + if (limit > 0 && foundWords.size() >= limit) { //超过匹配限制个数,直接返回 return foundWords; } - if(false == isDensityMatch){ + if (false == isDensityMatch) { //如果非密度匹配,跳过匹配到的词 i = j; } - if(false == isGreedMatch){ + if (false == isGreedMatch) { //如果懒惰匹配(非贪婪匹配)。当遇到第一个结尾标记就结束本轮匹配 break; } } current = current.get(currentChar); - if(null == current){ + if (null == current) { break; } } @@ -210,24 +241,27 @@ public class WordTree extends HashMap{ } return foundWords; } - - + + //--------------------------------------------------------------------------------------- Private method start + /** * 是否末尾 + * * @param c 检查的字符 * @return 是否末尾 */ - private boolean isEnd(Character c){ + private boolean isEnd(Character c) { return this.endCharacterSet.contains(c); } - + /** * 设置是否到达末尾 + * * @param c 设置结尾的字符 */ - private void setEnd(Character c){ - if(null != c){ + private void setEnd(Character c) { + if (null != c) { this.endCharacterSet.add(c); } } diff --git a/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java b/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java index f6e338155..9d500f870 100644 --- a/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java +++ b/hutool-dfa/src/test/java/cn/hutool/dfa/test/DfaTest.java @@ -16,8 +16,8 @@ import cn.hutool.dfa.WordTree; */ public class DfaTest { - // 构建被查询的文本 - String text = "我有一颗大土豆,刚出锅的"; + // 构建被查询的文本,包含停顿词 + String text = "我有一颗$大土^豆,刚出锅的"; @Test public void matchAllTest() { @@ -29,7 +29,7 @@ public class DfaTest { // 匹配到【大】,就不再继续匹配了,因此【大土豆】不匹配 // 匹配到【刚出锅】,就跳过这三个字了,因此【出锅】不匹配(由于刚首先被匹配,因此长的被匹配,最短匹配只针对第一个字相同选最短) List matchAll = tree.matchAll(text, -1, false, false); - Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土豆", "刚出锅")); + Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土^豆", "刚出锅")); } /** @@ -45,7 +45,7 @@ public class DfaTest { // 【大】被匹配,最短匹配原则【大土豆】被跳过,【土豆继续被匹配】 // 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配 List matchAll = tree.matchAll(text, -1, true, false); - Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土豆", "刚出锅", "出锅")); + Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土^豆", "刚出锅", "出锅")); } /** @@ -61,7 +61,7 @@ public class DfaTest { // 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配 // 由于【大土豆】被匹配,【土豆】被跳过,由于【刚出锅】被匹配,【出锅】被跳过 List matchAll = tree.matchAll(text, -1, false, true); - Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土豆", "刚出锅")); + Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土^豆", "刚出锅")); } @@ -78,7 +78,7 @@ public class DfaTest { // 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配,由于不跳过已经匹配的关键词,土豆继续被匹配 // 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配 List matchAll = tree.matchAll(text, -1, true, true); - Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土豆", "土豆", "刚出锅", "出锅")); + Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土^豆", "土^豆", "刚出锅", "出锅")); }