优化WordTree:

1. 只有真正的叶子节点才创建“尾节点集合”(endCharacterSet);
2. 除根节点外,其余节点的初始容量默认为1,避免无意义的内存浪费。
This commit is contained in:
emptypoint 2023-01-01 21:56:37 +08:00
parent 1d3c3a6c48
commit 7fc1ffe291

View File

@ -2,6 +2,7 @@ package cn.hutool.core.text.dfa;
import cn.hutool.core.collection.CollUtil; import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.collection.SetUtil; import cn.hutool.core.collection.SetUtil;
import cn.hutool.core.map.MapUtil;
import cn.hutool.core.text.StrUtil; import cn.hutool.core.text.StrUtil;
import java.util.ArrayList; import java.util.ArrayList;
@ -33,7 +34,7 @@ public class WordTree extends HashMap<Character, WordTree> {
/** /**
* 单词字符末尾标识,用于标识单词末尾字符 * 单词字符末尾标识,用于标识单词末尾字符
*/ */
private final Set<Character> endCharacterSet = new HashSet<>(); private Set<Character> endCharacterSet = null;
/** /**
* 字符过滤规则,通过定义字符串过滤规则,过滤不需要的字符,当accept为false时,此字符不参与匹配 * 字符过滤规则,通过定义字符串过滤规则,过滤不需要的字符,当accept为false时,此字符不参与匹配
*/ */
@ -46,6 +47,15 @@ public class WordTree extends HashMap<Character, WordTree> {
*/ */
public WordTree() { public WordTree() {
} }
/**
* 指定初始化容量
*
* @param initialCapacity 初始容量,一般是关键词的数量
*/
public WordTree(final int initialCapacity) {
super((int) (initialCapacity / MapUtil.DEFAULT_LOAD_FACTOR) + 1);
}
//--------------------------------------------------------------------------------------- Constructor start //--------------------------------------------------------------------------------------- Constructor start
/** /**
@ -99,6 +109,9 @@ public class WordTree extends HashMap<Character, WordTree> {
* @return this * @return this
*/ */
public WordTree addWord(final String word) { public WordTree addWord(final String word) {
if (null == word) {
return this;
}
final Predicate<Character> charFilter = this.charFilter; final Predicate<Character> charFilter = this.charFilter;
WordTree parent = null; WordTree parent = null;
WordTree current = this; WordTree current = this;
@ -107,13 +120,10 @@ public class WordTree extends HashMap<Character, WordTree> {
final int length = word.length(); final int length = word.length();
for (int i = 0; i < length; i++) { for (int i = 0; i < length; i++) {
currentChar = word.charAt(i); currentChar = word.charAt(i);
if (charFilter.test(currentChar)) {//只处理合法字符 //只处理合法字符
child = current.get(currentChar); if (charFilter.test(currentChar)) {
if (child == null) { //无子节点,新建一个子节点后存放下一个字符;子节点的同级节点不会太多,容量默认为1
//无子类,新建一个子节点后存放下一个字符 child = current.computeIfAbsent(currentChar, c -> new WordTree(1));
child = new WordTree();
current.put(currentChar, child);
}
parent = current; parent = current;
current = child; current = child;
} }
@ -132,9 +142,7 @@ public class WordTree extends HashMap<Character, WordTree> {
* @return 是否包含 * @return 是否包含
*/ */
public boolean isMatch(final String text) { public boolean isMatch(final String text) {
if (null == text) { //被检查的文本大概率不是null,由里层方法统一校验即可
return false;
}
return null != matchWord(text); return null != matchWord(text);
} }
@ -191,7 +199,7 @@ public class WordTree extends HashMap<Character, WordTree> {
* 找出所有匹配的关键字 * 找出所有匹配的关键字
* *
* @param text 被检查的文本 * @param text 被检查的文本
* @param limit 限制匹配个数 * @param limit 限制匹配个数,如果小于等于0,则返回全部匹配结果
* @return 匹配的词列表 * @return 匹配的词列表
*/ */
public List<String> matchAll(final String text, final int limit) { public List<String> matchAll(final String text, final int limit) {
@ -202,7 +210,7 @@ public class WordTree extends HashMap<Character, WordTree> {
* 找出所有匹配的关键字 * 找出所有匹配的关键字
* *
* @param text 被检查的文本 * @param text 被检查的文本
* @param limit 限制匹配个数 * @param limit 限制匹配个数,如果小于等于0,则返回全部匹配结果
* @return 匹配的词列表 * @return 匹配的词列表
* @since 5.5.3 * @since 5.5.3
*/ */
@ -212,11 +220,13 @@ public class WordTree extends HashMap<Character, WordTree> {
/** /**
* 找出所有匹配的关键字<br> * 找出所有匹配的关键字<br>
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br> * <p>假如被检查文本是{@literal "abab"}:<br>
* 密集匹配原则:假如关键词有 ab,b,将匹配 [ab,b,ab]<br>
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab] * 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
* </p>
* *
* @param text 被检查的文本 * @param text 被检查的文本
* @param limit 限制匹配个数 * @param limit 限制匹配个数,如果小于等于0,则返回全部匹配结果
* @param isDensityMatch 是否使用密集匹配原则 * @param isDensityMatch 是否使用密集匹配原则
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则 * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
* @return 匹配的词列表 * @return 匹配的词列表
@ -228,11 +238,13 @@ public class WordTree extends HashMap<Character, WordTree> {
/** /**
* 找出所有匹配的关键字<br> * 找出所有匹配的关键字<br>
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br> * <p>假如被检查文本是{@literal "abab"}:<br>
* 密集匹配原则:假如关键词有 ab,b,将匹配 [ab,b,ab]<br>
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab] * 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
* </p>
* *
* @param text 被检查的文本 * @param text 被检查的文本
* @param limit 限制匹配个数 * @param limit 限制匹配个数,如果小于等于0,则返回全部匹配结果
* @param isDensityMatch 是否使用密集匹配原则 * @param isDensityMatch 是否使用密集匹配原则
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则 * @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
* @return 匹配的词列表 * @return 匹配的词列表
@ -243,20 +255,20 @@ public class WordTree extends HashMap<Character, WordTree> {
return null; return null;
} }
final List<FoundWord> foundWords = new ArrayList<>(); final List<FoundWord> foundWords = limit > 0 ? new ArrayList<>(limit) : new ArrayList<>();
WordTree current = this; WordTree current;
final int length = text.length(); final int length = text.length();
final Predicate<Character> charFilter = this.charFilter; final Predicate<Character> charFilter = this.charFilter;
//存放查找到的字符缓存。完整出现一个词时加到findedWords中,否则清空 //存放查找到的字符缓存。完整出现一个词时加到foundWords中,否则清空
final StringBuilder wordBuffer = StrUtil.builder(); final StringBuilder wordBuffer = StrUtil.builder();
final StringBuilder keyBuffer = StrUtil.builder(); final StringBuilder keyBuffer = StrUtil.builder();
char currentChar; char currentChar;
for (int i = 0; i < length; i++) { for (int i = 0; i < length; i++) {
current = this;
wordBuffer.setLength(0); wordBuffer.setLength(0);
keyBuffer.setLength(0); keyBuffer.setLength(0);
for (int j = i; j < length; j++) { for (int j = i; j < length; j++) {
currentChar = text.charAt(j); currentChar = text.charAt(j);
// Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
if (false == charFilter.test(currentChar)) { if (false == charFilter.test(currentChar)) {
if (wordBuffer.length() > 0) { if (wordBuffer.length() > 0) {
//作为关键词中间的停顿词,被当作关键词的一部分被返回 //作为关键词中间的停顿词,被当作关键词的一部分被返回
@ -294,7 +306,6 @@ public class WordTree extends HashMap<Character, WordTree> {
break; break;
} }
} }
current = this;
} }
return foundWords; return foundWords;
} }
@ -306,19 +317,21 @@ public class WordTree extends HashMap<Character, WordTree> {
* @param c 检查的字符 * @param c 检查的字符
* @return 是否末尾 * @return 是否末尾
*/ */
private boolean isEnd(final Character c) { private boolean isEnd(final char c) {
return this.endCharacterSet.contains(c); return null != endCharacterSet && this.endCharacterSet.contains(c);
} }
/** /**
* 设置是否到达末尾 * 设置到达末尾
* *
* @param c 设置结尾的字符 * @param c 设置结尾的字符
*/ */
private void setEnd(final Character c) { private void setEnd(final char c) {
if (null != c) { if (null == endCharacterSet) {
this.endCharacterSet.add(c); // 叶子节点一般也就1个元素
endCharacterSet = new HashSet<>(2);
} }
this.endCharacterSet.add(c);
} }
/** /**
@ -329,7 +342,9 @@ public class WordTree extends HashMap<Character, WordTree> {
@Override @Override
public void clear() { public void clear() {
super.clear(); super.clear();
if (null != endCharacterSet) {
this.endCharacterSet.clear(); this.endCharacterSet.clear();
} }
}
//--------------------------------------------------------------------------------------- Private method end //--------------------------------------------------------------------------------------- Private method end
} }