mirror of
https://gitee.com/chinabugotech/hutool.git
synced 2025-05-09 23:51:34 +08:00
优化WordTree:
1. 只有真正的叶子节点才创建“尾节点集合”; 2. 除根节点外,其余节点的初始容量默认为1,避免无意义的内存浪费。
This commit is contained in:
parent
1d3c3a6c48
commit
7fc1ffe291
@ -2,6 +2,7 @@ package cn.hutool.core.text.dfa;
|
|||||||
|
|
||||||
import cn.hutool.core.collection.CollUtil;
|
import cn.hutool.core.collection.CollUtil;
|
||||||
import cn.hutool.core.collection.SetUtil;
|
import cn.hutool.core.collection.SetUtil;
|
||||||
|
import cn.hutool.core.map.MapUtil;
|
||||||
import cn.hutool.core.text.StrUtil;
|
import cn.hutool.core.text.StrUtil;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -33,7 +34,7 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
/**
|
/**
|
||||||
* 单词字符末尾标识,用于标识单词末尾字符
|
* 单词字符末尾标识,用于标识单词末尾字符
|
||||||
*/
|
*/
|
||||||
private final Set<Character> endCharacterSet = new HashSet<>();
|
private Set<Character> endCharacterSet = null;
|
||||||
/**
|
/**
|
||||||
* 字符过滤规则,通过定义字符串过滤规则,过滤不需要的字符,当accept为false时,此字符不参与匹配
|
* 字符过滤规则,通过定义字符串过滤规则,过滤不需要的字符,当accept为false时,此字符不参与匹配
|
||||||
*/
|
*/
|
||||||
@ -46,6 +47,15 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
*/
|
*/
|
||||||
public WordTree() {
|
public WordTree() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 指定初始化容量
|
||||||
|
*
|
||||||
|
* @param initialCapacity 初始容量,一般是关键词的数量
|
||||||
|
*/
|
||||||
|
public WordTree(final int initialCapacity) {
|
||||||
|
super((int) (initialCapacity / MapUtil.DEFAULT_LOAD_FACTOR) + 1);
|
||||||
|
}
|
||||||
//--------------------------------------------------------------------------------------- Constructor start
|
//--------------------------------------------------------------------------------------- Constructor start
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -99,6 +109,9 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
* @return this
|
* @return this
|
||||||
*/
|
*/
|
||||||
public WordTree addWord(final String word) {
|
public WordTree addWord(final String word) {
|
||||||
|
if (null == word) {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
final Predicate<Character> charFilter = this.charFilter;
|
final Predicate<Character> charFilter = this.charFilter;
|
||||||
WordTree parent = null;
|
WordTree parent = null;
|
||||||
WordTree current = this;
|
WordTree current = this;
|
||||||
@ -107,13 +120,10 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
final int length = word.length();
|
final int length = word.length();
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
currentChar = word.charAt(i);
|
currentChar = word.charAt(i);
|
||||||
if (charFilter.test(currentChar)) {//只处理合法字符
|
//只处理合法字符
|
||||||
child = current.get(currentChar);
|
if (charFilter.test(currentChar)) {
|
||||||
if (child == null) {
|
//无子节点时新建一个子节点,用于存放下一个字符;非根节点的下级分支通常很少,初始容量默认为1
|
||||||
//无子类,新建一个子节点后存放下一个字符
|
child = current.computeIfAbsent(currentChar, c -> new WordTree(1));
|
||||||
child = new WordTree();
|
|
||||||
current.put(currentChar, child);
|
|
||||||
}
|
|
||||||
parent = current;
|
parent = current;
|
||||||
current = child;
|
current = child;
|
||||||
}
|
}
|
||||||
@ -132,9 +142,7 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
* @return 是否包含
|
* @return 是否包含
|
||||||
*/
|
*/
|
||||||
public boolean isMatch(final String text) {
|
public boolean isMatch(final String text) {
|
||||||
if (null == text) {
|
//被检查的文本大概率不是null,由里层方法统一校验即可
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return null != matchWord(text);
|
return null != matchWord(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -191,7 +199,7 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
* 找出所有匹配的关键字
|
* 找出所有匹配的关键字
|
||||||
*
|
*
|
||||||
* @param text 被检查的文本
|
* @param text 被检查的文本
|
||||||
* @param limit 限制匹配个数
|
* @param limit 限制匹配个数,如果小于等于0,则返回全部匹配结果
|
||||||
* @return 匹配的词列表
|
* @return 匹配的词列表
|
||||||
*/
|
*/
|
||||||
public List<String> matchAll(final String text, final int limit) {
|
public List<String> matchAll(final String text, final int limit) {
|
||||||
@ -202,7 +210,7 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
* 找出所有匹配的关键字
|
* 找出所有匹配的关键字
|
||||||
*
|
*
|
||||||
* @param text 被检查的文本
|
* @param text 被检查的文本
|
||||||
* @param limit 限制匹配个数
|
* @param limit 限制匹配个数,如果小于等于0,则返回全部匹配结果
|
||||||
* @return 匹配的词列表
|
* @return 匹配的词列表
|
||||||
* @since 5.5.3
|
* @since 5.5.3
|
||||||
*/
|
*/
|
||||||
@ -212,11 +220,13 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 找出所有匹配的关键字<br>
|
* 找出所有匹配的关键字<br>
|
||||||
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br>
|
* <p>假如被检查文本是{@literal "abab"}<br>
|
||||||
|
* 密集匹配原则:假如关键词有 ab,b,将匹配 [ab,b,ab]<br>
|
||||||
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
|
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
|
||||||
|
* </p>
|
||||||
*
|
*
|
||||||
* @param text 被检查的文本
|
* @param text 被检查的文本
|
||||||
* @param limit 限制匹配个数
|
* @param limit 限制匹配个数,如果小于等于0,则返回全部匹配结果
|
||||||
* @param isDensityMatch 是否使用密集匹配原则
|
* @param isDensityMatch 是否使用密集匹配原则
|
||||||
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
||||||
* @return 匹配的词列表
|
* @return 匹配的词列表
|
||||||
@ -228,11 +238,13 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 找出所有匹配的关键字<br>
|
* 找出所有匹配的关键字<br>
|
||||||
* 密集匹配原则:假如关键词有 ab,b,文本是abab,将匹配 [ab,b,ab]<br>
|
* <p>假如被检查文本是{@literal "abab"}<br>
|
||||||
|
* 密集匹配原则:假如关键词有 ab,b,将匹配 [ab,b,ab]<br>
|
||||||
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
|
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
|
||||||
|
* </p>
|
||||||
*
|
*
|
||||||
* @param text 被检查的文本
|
* @param text 被检查的文本
|
||||||
* @param limit 限制匹配个数
|
* @param limit 限制匹配个数,如果小于等于0,则返回全部匹配结果
|
||||||
* @param isDensityMatch 是否使用密集匹配原则
|
* @param isDensityMatch 是否使用密集匹配原则
|
||||||
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
* @param isGreedMatch 是否使用贪婪匹配(最长匹配)原则
|
||||||
* @return 匹配的词列表
|
* @return 匹配的词列表
|
||||||
@ -243,20 +255,20 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<FoundWord> foundWords = new ArrayList<>();
|
final List<FoundWord> foundWords = limit > 0 ? new ArrayList<>(limit) : new ArrayList<>();
|
||||||
WordTree current = this;
|
WordTree current;
|
||||||
final int length = text.length();
|
final int length = text.length();
|
||||||
final Predicate<Character> charFilter = this.charFilter;
|
final Predicate<Character> charFilter = this.charFilter;
|
||||||
//存放查找到的字符缓存。完整出现一个词时加到findedWords中,否则清空
|
//存放查找到的字符缓存。完整出现一个词时加到foundWords中,否则清空
|
||||||
final StringBuilder wordBuffer = StrUtil.builder();
|
final StringBuilder wordBuffer = StrUtil.builder();
|
||||||
final StringBuilder keyBuffer = StrUtil.builder();
|
final StringBuilder keyBuffer = StrUtil.builder();
|
||||||
char currentChar;
|
char currentChar;
|
||||||
for (int i = 0; i < length; i++) {
|
for (int i = 0; i < length; i++) {
|
||||||
|
current = this;
|
||||||
wordBuffer.setLength(0);
|
wordBuffer.setLength(0);
|
||||||
keyBuffer.setLength(0);
|
keyBuffer.setLength(0);
|
||||||
for (int j = i; j < length; j++) {
|
for (int j = i; j < length; j++) {
|
||||||
currentChar = text.charAt(j);
|
currentChar = text.charAt(j);
|
||||||
// Console.log("i: {}, j: {}, currentChar: {}", i, j, currentChar);
|
|
||||||
if (false == charFilter.test(currentChar)) {
|
if (false == charFilter.test(currentChar)) {
|
||||||
if (wordBuffer.length() > 0) {
|
if (wordBuffer.length() > 0) {
|
||||||
//做为关键词中间的停顿词被当作关键词的一部分被返回
|
//做为关键词中间的停顿词被当作关键词的一部分被返回
|
||||||
@ -294,7 +306,6 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
current = this;
|
|
||||||
}
|
}
|
||||||
return foundWords;
|
return foundWords;
|
||||||
}
|
}
|
||||||
@ -306,19 +317,21 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
* @param c 检查的字符
|
* @param c 检查的字符
|
||||||
* @return 是否末尾
|
* @return 是否末尾
|
||||||
*/
|
*/
|
||||||
private boolean isEnd(final Character c) {
|
private boolean isEnd(final char c) {
|
||||||
return this.endCharacterSet.contains(c);
|
return null != endCharacterSet && this.endCharacterSet.contains(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 设置是否到达末尾
|
* 设置已到达末尾
|
||||||
*
|
*
|
||||||
* @param c 设置结尾的字符
|
* @param c 设置结尾的字符
|
||||||
*/
|
*/
|
||||||
private void setEnd(final Character c) {
|
private void setEnd(final char c) {
|
||||||
if (null != c) {
|
if (null == endCharacterSet) {
|
||||||
this.endCharacterSet.add(c);
|
// 叶子节点一般也就1个元素
|
||||||
|
endCharacterSet = new HashSet<>(2);
|
||||||
}
|
}
|
||||||
|
this.endCharacterSet.add(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -329,7 +342,9 @@ public class WordTree extends HashMap<Character, WordTree> {
|
|||||||
@Override
|
@Override
|
||||||
public void clear() {
|
public void clear() {
|
||||||
super.clear();
|
super.clear();
|
||||||
|
if (null != endCharacterSet) {
|
||||||
this.endCharacterSet.clear();
|
this.endCharacterSet.clear();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
//--------------------------------------------------------------------------------------- Private method end
|
//--------------------------------------------------------------------------------------- Private method end
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user