add filter support

This commit is contained in:
Looly 2020-03-03 11:18:55 +08:00
parent 8fba51f62b
commit 2e2d43d764
5 changed files with 99 additions and 60 deletions

View File

@ -20,6 +20,7 @@
* 【crypto】 RSA算法中BlockSize长度策略调整issue#721@Github * 【crypto】 RSA算法中BlockSize长度策略调整issue#721@Github
* 【crypto】 删除SM2Engine使用BC库中的对象替代 * 【crypto】 删除SM2Engine使用BC库中的对象替代
* 【crypto】 增加PemUtil工具类 * 【crypto】 增加PemUtil工具类
* 【dfa 】 WordTree增加Filter,支持自定义特殊字符过滤器
### Bug修复 ### Bug修复

View File

@ -2,13 +2,14 @@ package cn.hutool.core.lang;
/** /**
* 过滤器接口 * 过滤器接口
* @author Looly
* *
* @author Looly
*/ */
@FunctionalInterface @FunctionalInterface
public interface Filter<T> { public interface Filter<T> {
/** /**
* 是否接受对象 * 是否接受对象
*
* @param t 检查的对象 * @param t 检查的对象
* @return 是否接受对象 * @return 是否接受对象
*/ */

View File

@ -9,6 +9,9 @@ import org.junit.Test;
public class BCUtilTest { public class BCUtilTest {
/**
* 密钥生成来自https://i.goto327.top/CryptTools/SM2.aspx?tdsourcetag=s_pctim_aiomsg
*/
@Test @Test
public void createECPublicKeyParametersTest() { public void createECPublicKeyParametersTest() {
String x = "706AD9DAA3E5CEAC3DA59F583429E8043BAFC576BE10092C4EA4D8E19846CA62"; String x = "706AD9DAA3E5CEAC3DA59F583429E8043BAFC576BE10092C4EA4D8E19846CA62";

View File

@ -1,4 +1,5 @@
package cn.hutool.dfa; package cn.hutool.dfa;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
@ -7,6 +8,7 @@ import java.util.List;
import java.util.Set; import java.util.Set;
import cn.hutool.core.collection.CollectionUtil; import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.lang.Filter;
import cn.hutool.core.text.StrBuilder; import cn.hutool.core.text.StrBuilder;
import cn.hutool.core.util.StrUtil; import cn.hutool.core.util.StrUtil;
@ -21,19 +23,24 @@ import cn.hutool.core.util.StrUtil;
* <br> * <br>
* / <br> * / <br>
* <br> * <br>
*其中每个节点都是一个WordTree对象查找时从上向下查找<br> * 其中每个节点都是一个WordTree对象查找时从上向下查找<br>
* @author Looly
* *
* @author Looly
*/ */
public class WordTree extends HashMap<Character, WordTree>{ public class WordTree extends HashMap<Character, WordTree> {
private static final long serialVersionUID = -4646423269465809276L; private static final long serialVersionUID = -4646423269465809276L;
/** /**
* 敏感词字符末尾标识用于标识单词末尾字符 * 敏感词字符末尾标识用于标识单词末尾字符
*/ */
private Set<Character> endCharacterSet = new HashSet<>(); private Set<Character> endCharacterSet = new HashSet<>();
/**
* 字符过滤规则,通过定义字符串过滤规则,过滤不需要的字符,当accept为false时,此字符不参与匹配
*/
private Filter<Character> charFilter = StopChar::isNotStopChar;
//--------------------------------------------------------------------------------------- Constructor start //--------------------------------------------------------------------------------------- Constructor start
/** /**
* 默认构造 * 默认构造
*/ */
@ -41,14 +48,28 @@ public class WordTree extends HashMap<Character, WordTree>{
} }
//--------------------------------------------------------------------------------------- Constructor start //--------------------------------------------------------------------------------------- Constructor start
/**
 * Sets the character filter used by this tree.<br>
 * A character for which {@code accept} returns {@code false} does not take part in matching:
 * {@code addWord} does not insert it into the tree, and {@code matchAll} does not look it up
 * in the tree.
 * <p>
 * NOTE(review): the argument is stored without a null check, while {@code addWord} and
 * {@code matchAll} call {@code charFilter.accept(...)} directly, so a null filter looks
 * unsupported; confirm with callers.
 *
 * @param charFilter filter function deciding which characters take part in matching
 * @return this
 * @since 5.2.0
 */
public WordTree setCharFilter(Filter<Character> charFilter) {
this.charFilter = charFilter;
return this;
}
//------------------------------------------------------------------------------- add word //------------------------------------------------------------------------------- add word
/** /**
* 增加一组单词 * 增加一组单词
*
* @param words 单词集合 * @param words 单词集合
*/ */
public void addWords(Collection<String> words){ public void addWords(Collection<String> words) {
if(false == (words instanceof Set)){ if (false == (words instanceof Set)) {
words = new HashSet<>(words); words = new HashSet<>(words);
} }
for (String word : words) { for (String word : words) {
@ -58,9 +79,10 @@ public class WordTree extends HashMap<Character, WordTree>{
/** /**
* 增加一组单词 * 增加一组单词
*
* @param words 单词数组 * @param words 单词数组
*/ */
public void addWords(String... words){ public void addWords(String... words) {
HashSet<String> wordsSet = CollectionUtil.newHashSet(words); HashSet<String> wordsSet = CollectionUtil.newHashSet(words);
for (String word : wordsSet) { for (String word : wordsSet) {
addWord(word); addWord(word);
@ -69,19 +91,21 @@ public class WordTree extends HashMap<Character, WordTree>{
/** /**
* 添加单词使用默认类型 * 添加单词使用默认类型
*
* @param word 单词 * @param word 单词
*/ */
public void addWord(String word) { public void addWord(String word) {
final Filter<Character> charFilter = this.charFilter;
WordTree parent = null; WordTree parent = null;
WordTree current = this; WordTree current = this;
WordTree child; WordTree child;
char currentChar = 0; char currentChar = 0;
int length = word.length(); int length = word.length();
for(int i = 0; i < length; i++){ for (int i = 0; i < length; i++) {
currentChar = word.charAt(i); currentChar = word.charAt(i);
if(false == StopChar.isStopChar(currentChar)){//只处理合法字符 if (charFilter.accept(currentChar)) {//只处理合法字符
child = current.get(currentChar); child = current.get(currentChar);
if(child == null){ if (child == null) {
//无子类新建一个子节点后存放下一个字符 //无子类新建一个子节点后存放下一个字符
child = new WordTree(); child = new WordTree();
current.put(currentChar, child); current.put(currentChar, child);
@ -90,19 +114,21 @@ public class WordTree extends HashMap<Character, WordTree>{
current = child; current = child;
} }
} }
if(null != parent){ if (null != parent) {
parent.setEnd(currentChar); parent.setEnd(currentChar);
} }
} }
//------------------------------------------------------------------------------- match //------------------------------------------------------------------------------- match
/** /**
* 指定文本是否包含树中的词 * 指定文本是否包含树中的词
*
* @param text 被检查的文本 * @param text 被检查的文本
* @return 是否包含 * @return 是否包含
*/ */
public boolean isMatch(String text){ public boolean isMatch(String text) {
if(null == text){ if (null == text) {
return false; return false;
} }
return null != match(text); return null != match(text);
@ -110,23 +136,26 @@ public class WordTree extends HashMap<Character, WordTree>{
/** /**
* 获得第一个匹配的关键字 * 获得第一个匹配的关键字
*
* @param text 被检查的文本 * @param text 被检查的文本
* @return 匹配到的关键字 * @return 匹配到的关键字
*/ */
public String match(String text){ public String match(String text) {
if(null == text){ if (null == text) {
return null; return null;
} }
List<String> matchAll = matchAll(text, 1); List<String> matchAll = matchAll(text, 1);
if(CollectionUtil.isNotEmpty(matchAll)){ if (CollectionUtil.isNotEmpty(matchAll)) {
return matchAll.get(0); return matchAll.get(0);
} }
return null; return null;
} }
//------------------------------------------------------------------------------- match all //------------------------------------------------------------------------------- match all
/** /**
* 找出所有匹配的关键字 * 找出所有匹配的关键字
*
* @param text 被检查的文本 * @param text 被检查的文本
* @return 匹配的词列表 * @return 匹配的词列表
*/ */
@ -136,6 +165,7 @@ public class WordTree extends HashMap<Character, WordTree>{
/** /**
* 找出所有匹配的关键字 * 找出所有匹配的关键字
*
* @param text 被检查的文本 * @param text 被检查的文本
* @param limit 限制匹配个数 * @param limit 限制匹配个数
* @return 匹配的词列表 * @return 匹配的词列表
@ -156,13 +186,14 @@ public class WordTree extends HashMap<Character, WordTree>{
* @return 匹配的词列表 * @return 匹配的词列表
*/ */
public List<String> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) { public List<String> matchAll(String text, int limit, boolean isDensityMatch, boolean isGreedMatch) {
if(null == text){ if (null == text) {
return null; return null;
} }
List<String> foundWords = new ArrayList<>(); List<String> foundWords = new ArrayList<>();
WordTree current = this; WordTree current = this;
int length = text.length(); int length = text.length();
final Filter<Character> charFilter = this.charFilter;
//存放查找到的字符缓存完整出现一个词时加到findedWords中否则清空 //存放查找到的字符缓存完整出现一个词时加到findedWords中否则清空
final StrBuilder wordBuffer = StrUtil.strBuilder(); final StrBuilder wordBuffer = StrUtil.strBuilder();
char currentChar; char currentChar;
@ -171,38 +202,38 @@ public class WordTree extends HashMap<Character, WordTree>{
for (int j = i; j < length; j++) { for (int j = i; j < length; j++) {
currentChar = text.charAt(j); currentChar = text.charAt(j);
// Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar); // Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
if(StopChar.isStopChar(currentChar)){ if (false == charFilter.accept(currentChar)) {
if(wordBuffer.length() > 0){ if (wordBuffer.length() > 0) {
//做为关键词中间的停顿词被当作关键词的一部分被返回 //做为关键词中间的停顿词被当作关键词的一部分被返回
wordBuffer.append(currentChar); wordBuffer.append(currentChar);
}else{ } else {
//停顿词做为关键词的第一个字符时需要跳过 //停顿词做为关键词的第一个字符时需要跳过
i++; i++;
} }
continue; continue;
}else if(false == current.containsKey(currentChar)){ } else if (false == current.containsKey(currentChar)) {
//非关键字符被整体略过重新以下个字符开始检查 //非关键字符被整体略过重新以下个字符开始检查
break; break;
} }
wordBuffer.append(currentChar); wordBuffer.append(currentChar);
if(current.isEnd(currentChar)){ if (current.isEnd(currentChar)) {
//到达单词末尾关键词成立从此词的下一个位置开始查找 //到达单词末尾关键词成立从此词的下一个位置开始查找
foundWords.add(wordBuffer.toString()); foundWords.add(wordBuffer.toString());
if(limit > 0 && foundWords.size() >= limit){ if (limit > 0 && foundWords.size() >= limit) {
//超过匹配限制个数直接返回 //超过匹配限制个数直接返回
return foundWords; return foundWords;
} }
if(false == isDensityMatch){ if (false == isDensityMatch) {
//如果非密度匹配跳过匹配到的词 //如果非密度匹配跳过匹配到的词
i = j; i = j;
} }
if(false == isGreedMatch){ if (false == isGreedMatch) {
//如果懒惰匹配非贪婪匹配当遇到第一个结尾标记就结束本轮匹配 //如果懒惰匹配非贪婪匹配当遇到第一个结尾标记就结束本轮匹配
break; break;
} }
} }
current = current.get(currentChar); current = current.get(currentChar);
if(null == current){ if (null == current) {
break; break;
} }
} }
@ -213,21 +244,24 @@ public class WordTree extends HashMap<Character, WordTree>{
//--------------------------------------------------------------------------------------- Private method start //--------------------------------------------------------------------------------------- Private method start
/** /**
* 是否末尾 * 是否末尾
*
* @param c 检查的字符 * @param c 检查的字符
* @return 是否末尾 * @return 是否末尾
*/ */
private boolean isEnd(Character c){ private boolean isEnd(Character c) {
return this.endCharacterSet.contains(c); return this.endCharacterSet.contains(c);
} }
/** /**
* 设置是否到达末尾 * 设置是否到达末尾
*
* @param c 设置结尾的字符 * @param c 设置结尾的字符
*/ */
private void setEnd(Character c){ private void setEnd(Character c) {
if(null != c){ if (null != c) {
this.endCharacterSet.add(c); this.endCharacterSet.add(c);
} }
} }

View File

@ -16,8 +16,8 @@ import cn.hutool.dfa.WordTree;
*/ */
public class DfaTest { public class DfaTest {
// 构建被查询的文本 // 构建被查询的文本包含停顿词
String text = "我有一颗大土豆,刚出锅的"; String text = "我有一颗$大土^豆,刚出锅的";
@Test @Test
public void matchAllTest() { public void matchAllTest() {
@ -29,7 +29,7 @@ public class DfaTest {
// 匹配到【大】,就不再继续匹配了,因此【大土豆】不匹配 // 匹配到【大】,就不再继续匹配了,因此【大土豆】不匹配
// 匹配到【刚出锅】,就跳过这三个字了,因此【出锅】不匹配(由于刚首先被匹配,因此长的被匹配,最短匹配只针对第一个字相同选最短) // 匹配到【刚出锅】,就跳过这三个字了,因此【出锅】不匹配(由于刚首先被匹配,因此长的被匹配,最短匹配只针对第一个字相同选最短)
List<String> matchAll = tree.matchAll(text, -1, false, false); List<String> matchAll = tree.matchAll(text, -1, false, false);
Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土豆", "刚出锅")); Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土^豆", "刚出锅"));
} }
/** /**
@ -45,7 +45,7 @@ public class DfaTest {
// 【大】被匹配,最短匹配原则【大土豆】被跳过,【土豆】继续被匹配 // 【大】被匹配,最短匹配原则【大土豆】被跳过,【土豆】继续被匹配
// 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配 // 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配
List<String> matchAll = tree.matchAll(text, -1, true, false); List<String> matchAll = tree.matchAll(text, -1, true, false);
Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土豆", "刚出锅", "出锅")); Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "土^豆", "刚出锅", "出锅"));
} }
/** /**
@ -61,7 +61,7 @@ public class DfaTest {
// 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配 // 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配
// 由于【大土豆】被匹配,【土豆】被跳过,由于【刚出锅】被匹配,【出锅】被跳过 // 由于【大土豆】被匹配,【土豆】被跳过,由于【刚出锅】被匹配,【出锅】被跳过
List<String> matchAll = tree.matchAll(text, -1, false, true); List<String> matchAll = tree.matchAll(text, -1, false, true);
Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土豆", "刚出锅")); Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土^豆", "刚出锅"));
} }
@ -78,7 +78,7 @@ public class DfaTest {
// 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配,由于不跳过已经匹配的关键词,【土豆】继续被匹配 // 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配,由于不跳过已经匹配的关键词,【土豆】继续被匹配
// 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配 // 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配
List<String> matchAll = tree.matchAll(text, -1, true, true); List<String> matchAll = tree.matchAll(text, -1, true, true);
Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土豆", "土豆", "刚出锅", "出锅")); Assert.assertEquals(matchAll, CollectionUtil.newArrayList("大", "大土^豆", "土^豆", "刚出锅", "出锅"));
} }