mirror of
https://gitee.com/chinabugotech/hutool.git
synced 2025-04-19 03:01:48 +08:00
AC自动机实现
This commit is contained in:
parent
f78f9569e6
commit
1578a46761
@ -43,10 +43,7 @@ import java.util.HashSet;
|
|||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.function.Function;
|
import java.util.function.*;
|
||||||
import java.util.function.Predicate;
|
|
||||||
import java.util.function.Supplier;
|
|
||||||
import java.util.function.UnaryOperator;
|
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@ -1730,7 +1727,7 @@ public class CharSequenceUtil extends StrValidator {
|
|||||||
|
|
||||||
final StringBuilder sb = new StringBuilder();
|
final StringBuilder sb = new StringBuilder();
|
||||||
final int subLen = toIndex - fromIndex;
|
final int subLen = toIndex - fromIndex;
|
||||||
str.toString().codePoints().skip(fromIndex).limit(subLen).forEach(v -> sb.append(Character.toChars(v)));
|
str.codePoints().skip(fromIndex).limit(subLen).forEach(v -> sb.append(Character.toChars(v)));
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4131,4 +4128,28 @@ public class CharSequenceUtil extends StrValidator {
|
|||||||
}
|
}
|
||||||
return (isCodePoint ? str.codePoints() : str.chars()).toArray();
|
return (isCodePoint ? str.codePoints() : str.chars()).toArray();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 遍历字符串的每个字符,并处理
|
||||||
|
*
|
||||||
|
* @param str 字符串
|
||||||
|
* @param consumer 字符处理
|
||||||
|
*/
|
||||||
|
public static void forEach(final CharSequence str, final Consumer<Character> consumer) {
|
||||||
|
forEach(str, false, (cInt)-> consumer.accept((char) cInt));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 遍历字符串的每个字符,并处理
|
||||||
|
*
|
||||||
|
* @param str 字符串
|
||||||
|
* @param isCodePoint 是否为Unicode码点(即支持emoji等多char字符)
|
||||||
|
* @param consumer 字符处理
|
||||||
|
*/
|
||||||
|
public static void forEach(final CharSequence str, final boolean isCodePoint, final IntConsumer consumer) {
|
||||||
|
if (null == str) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(isCodePoint ? str.codePoints() : str.chars()).forEach(consumer);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,51 +1,56 @@
|
|||||||
package org.dromara.hutool.core.text.finder;
|
package org.dromara.hutool.core.text.finder;
|
||||||
|
|
||||||
|
import org.dromara.hutool.core.text.StrUtil;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 多字符串查询器 底层思路 使用 AC 自动机实现
|
* 多字符串查询器 底层思路 使用 AC 自动机实现
|
||||||
|
*
|
||||||
* @author newshiJ
|
* @author newshiJ
|
||||||
* @date 2024/8/2 上午10:07
|
|
||||||
*/
|
*/
|
||||||
public class MultiStrFinder {
|
public class MultiStrFinder {
|
||||||
|
|
||||||
// 字符索引
|
/**
|
||||||
protected final Map<Character,Integer> charIndex = new HashMap<>();
|
* 创建多字符串查询器
|
||||||
|
* @param source 字符串集合
|
||||||
|
* @return 多字符串查询器
|
||||||
|
*/
|
||||||
|
public static MultiStrFinder of(final Collection<String> source) {
|
||||||
|
return new MultiStrFinder(source);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 字符索引
|
||||||
|
protected final Map<Character, Integer> charIndexMap = new HashMap<>();
|
||||||
// 全部字符数量
|
// 全部字符数量
|
||||||
protected final int allCharSize;
|
protected final int allCharSize;
|
||||||
|
|
||||||
// 根节点
|
// 根节点
|
||||||
protected final Node root;
|
protected final Node root;
|
||||||
|
|
||||||
// 全部节点数量
|
// 全部节点数量
|
||||||
int nodeSize;
|
int nodeSize;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 构建多字符串查询器
|
* 构建多字符串查询器
|
||||||
* @param source
|
*
|
||||||
|
* @param source 字符串集合
|
||||||
*/
|
*/
|
||||||
public MultiStrFinder(Collection<String> source){
|
public MultiStrFinder(final Collection<String> source) {
|
||||||
// 待匹配的字符串
|
// 待匹配的字符串
|
||||||
final Set<String> stringSet = new HashSet<>();
|
final Set<String> stringSet = new HashSet<>();
|
||||||
|
|
||||||
// 所有字符
|
// 所有字符
|
||||||
final Set<Character> charSet = new HashSet<>();
|
final Set<Character> charSet = new HashSet<>();
|
||||||
for (String string : source) {
|
for (final String string : source) {
|
||||||
stringSet.add(string);
|
stringSet.add(string);
|
||||||
char[] charArray = string.toCharArray();
|
StrUtil.forEach(string, charSet::add);
|
||||||
for (char c : charArray) {
|
|
||||||
charSet.add(c);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
allCharSize = charSet.size();
|
allCharSize = charSet.size();
|
||||||
int index = 0;
|
int index = 0;
|
||||||
for (Character c : charSet) {
|
for (final Character c : charSet) {
|
||||||
charIndex.put(c,index);
|
charIndexMap.put(c,index);
|
||||||
index ++;
|
index ++;
|
||||||
}
|
}
|
||||||
|
this.root = Node.createRoot(index);
|
||||||
root = Node.createRoot(allCharSize);
|
|
||||||
|
|
||||||
buildPrefixTree(stringSet);
|
buildPrefixTree(stringSet);
|
||||||
buildFail();
|
buildFail();
|
||||||
@ -53,17 +58,16 @@ public class MultiStrFinder {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 构建前缀树
|
* 构建前缀树
|
||||||
|
*
|
||||||
* @param stringSst 待匹配的字符串
|
* @param stringSst 待匹配的字符串
|
||||||
*/
|
*/
|
||||||
protected void buildPrefixTree(Collection<String> stringSst){
|
protected void buildPrefixTree(final Collection<String> stringSst) {
|
||||||
// 节点编号 根节点已经是0了 所以从 1开始编号
|
// 节点编号 根节点已经是0了 所以从 1开始编号
|
||||||
int nodeIndex = 1;
|
int nodeIndex = 1;
|
||||||
for (String string : stringSst) {
|
for (final String string : stringSst) {
|
||||||
Node node = root;
|
Node node = root;
|
||||||
char[] charArray = string.toCharArray();
|
for (final char c : string.toCharArray()) {
|
||||||
for (int i = 0; i < charArray.length; i++) {
|
final boolean addValue = node.addValue(c, nodeIndex, charIndexMap);
|
||||||
char c = charArray[i];
|
|
||||||
boolean addValue = node.addValue(c, nodeIndex, charIndex);
|
|
||||||
if (addValue) {
|
if (addValue) {
|
||||||
nodeIndex++;
|
nodeIndex++;
|
||||||
}
|
}
|
||||||
@ -79,9 +83,9 @@ public class MultiStrFinder {
|
|||||||
* 构建 directRouter 直接访问路由表 减少跳fail次数 直接跳 router 边
|
* 构建 directRouter 直接访问路由表 减少跳fail次数 直接跳 router 边
|
||||||
*/
|
*/
|
||||||
protected void buildFail() {
|
protected void buildFail() {
|
||||||
LinkedList<Node> nodeQueue = new LinkedList<>();
|
final LinkedList<Node> nodeQueue = new LinkedList<>();
|
||||||
for (int i = 0; i < root.directRouter.length; i++) {
|
for (int i = 0; i < root.directRouter.length; i++) {
|
||||||
Node nextNode = root.directRouter[i];
|
final Node nextNode = root.directRouter[i];
|
||||||
if (nextNode == null) {
|
if (nextNode == null) {
|
||||||
root.directRouter[i] = root;
|
root.directRouter[i] = root;
|
||||||
continue;
|
continue;
|
||||||
@ -92,10 +96,10 @@ public class MultiStrFinder {
|
|||||||
|
|
||||||
// 进行广度优先遍历
|
// 进行广度优先遍历
|
||||||
while (!nodeQueue.isEmpty()) {
|
while (!nodeQueue.isEmpty()) {
|
||||||
Node parent = nodeQueue.removeFirst();
|
final Node parent = nodeQueue.removeFirst();
|
||||||
// 因为 使用了 charIndex 进行字符到下标的映射 i 可以直接认为就是对应字符 char
|
// 因为 使用了 charIndex 进行字符到下标的映射 i 可以直接认为就是对应字符 char
|
||||||
for (int i = 0; i < parent.directRouter.length; i++) {
|
for (int i = 0; i < parent.directRouter.length; i++) {
|
||||||
Node child = parent.directRouter[i];
|
final Node child = parent.directRouter[i];
|
||||||
// child 为 null 表示没有子节点
|
// child 为 null 表示没有子节点
|
||||||
if (child == null) {
|
if (child == null) {
|
||||||
parent.directRouter[i] = parent.fail.directRouter[i];
|
parent.directRouter[i] = parent.fail.directRouter[i];
|
||||||
@ -110,18 +114,19 @@ public class MultiStrFinder {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 查询匹配的字符串
|
* 查询匹配的字符串
|
||||||
|
*
|
||||||
* @param text 返回每个匹配的 字符串 value是字符首字母地址
|
* @param text 返回每个匹配的 字符串 value是字符首字母地址
|
||||||
* @return
|
* @return 匹配结果
|
||||||
*/
|
*/
|
||||||
public Map<String,List<Integer>> findMatch(String text){
|
public Map<String, List<Integer>> findMatch(final String text) {
|
||||||
// 节点经过次数 放在方法内部声明变量 希望可以一个构建对象 进行多次匹配
|
// 节点经过次数 放在方法内部声明变量 希望可以一个构建对象 进行多次匹配
|
||||||
HashMap<String, List<Integer>> resultMap = new HashMap<>();
|
final HashMap<String, List<Integer>> resultMap = new HashMap<>();
|
||||||
|
|
||||||
char[] chars = text.toCharArray();
|
final char[] chars = text.toCharArray();
|
||||||
Node currentNode = root;
|
Node currentNode = root;
|
||||||
for (int i = 0; i < chars.length; i++) {
|
for (int i = 0; i < chars.length; i++) {
|
||||||
char c = chars[i];
|
final char c = chars[i];
|
||||||
Integer index = charIndex.get(c);
|
final Integer index = charIndexMap.get(c);
|
||||||
// 找不到字符索引 认为一定不在匹配字符中存在 直接从根节点开始重新计算
|
// 找不到字符索引 认为一定不在匹配字符中存在 直接从根节点开始重新计算
|
||||||
if (index == null) {
|
if (index == null) {
|
||||||
currentNode = root;
|
currentNode = root;
|
||||||
@ -143,11 +148,12 @@ public class MultiStrFinder {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取字符 下标
|
* 获取字符 下标
|
||||||
* @param c
|
*
|
||||||
* @return
|
* @param c 字符
|
||||||
|
* @return 下标
|
||||||
*/
|
*/
|
||||||
protected int getIndex(char c){
|
protected int getIndex(final char c) {
|
||||||
Integer i = charIndex.get(c);
|
final Integer i = charIndexMap.get(c);
|
||||||
if (i == null) {
|
if (i == null) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@ -155,10 +161,6 @@ public class MultiStrFinder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static MultiStrFinder create(Collection<String> source){
|
|
||||||
return new MultiStrFinder(source);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* AC 自动机节点
|
* AC 自动机节点
|
||||||
*/
|
*/
|
||||||
@ -189,17 +191,19 @@ public class MultiStrFinder {
|
|||||||
// fail指针来源
|
// fail指针来源
|
||||||
public List<Node> failPre = new ArrayList<>();
|
public List<Node> failPre = new ArrayList<>();
|
||||||
|
|
||||||
public Node(){}
|
public Node() {
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 新增子节点
|
* 新增子节点
|
||||||
|
*
|
||||||
* @param c 字符
|
* @param c 字符
|
||||||
* @param nodeIndex 节点编号
|
* @param nodeIndex 节点编号
|
||||||
* @param charIndex 字符索引
|
* @param charIndex 字符索引
|
||||||
* @return 如果已经存在子节点 false 新增 true
|
* @return 如果已经存在子节点 false 新增 true
|
||||||
*/
|
*/
|
||||||
public boolean addValue(char c, int nodeIndex ,Map<Character,Integer> charIndex){
|
public boolean addValue(final char c, final int nodeIndex, final Map<Character, Integer> charIndex) {
|
||||||
Integer index = charIndex.get(c);
|
final Integer index = charIndex.get(c);
|
||||||
Node node = directRouter[index];
|
Node node = directRouter[index];
|
||||||
if (node != null) {
|
if (node != null) {
|
||||||
return false;
|
return false;
|
||||||
@ -214,21 +218,23 @@ public class MultiStrFinder {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 标记当前节点为 字符串尾节点
|
* 标记当前节点为 字符串尾节点
|
||||||
|
*
|
||||||
* @param string
|
* @param string
|
||||||
*/
|
*/
|
||||||
public void setEnd(String string){
|
public void setEnd(final String string) {
|
||||||
tagetString = string;
|
tagetString = string;
|
||||||
isEnd = true;
|
isEnd = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取下一跳
|
* 获取下一跳
|
||||||
|
*
|
||||||
* @param c 字符
|
* @param c 字符
|
||||||
* @param charIndex 字符索引
|
* @param charIndex 字符索引
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Node getNext(char c,Map<Character,Integer> charIndex){
|
public Node getNext(final char c, final Map<Character, Integer> charIndex) {
|
||||||
Integer index = charIndex.get(c);
|
final Integer index = charIndex.get(c);
|
||||||
if (index == null) {
|
if (index == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@ -237,11 +243,12 @@ public class MultiStrFinder {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* 构建根节点
|
* 构建根节点
|
||||||
|
*
|
||||||
* @param allCharSize 全部字符数量
|
* @param allCharSize 全部字符数量
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static Node createRoot(int allCharSize){
|
public static Node createRoot(final int allCharSize) {
|
||||||
Node node = new Node();
|
final Node node = new Node();
|
||||||
node.nodeIndex = 0;
|
node.nodeIndex = 0;
|
||||||
node.fail = node;
|
node.fail = node;
|
||||||
node.directRouter = new Node[allCharSize];
|
node.directRouter = new Node[allCharSize];
|
||||||
|
@ -14,9 +14,9 @@ import java.util.*;
|
|||||||
* 3、"abc", "bc" 会优先替换"abc"
|
* 3、"abc", "bc" 会优先替换"abc"
|
||||||
*
|
*
|
||||||
* @author newshiJ
|
* @author newshiJ
|
||||||
* @date 2024/8/2 下午3:41
|
|
||||||
*/
|
*/
|
||||||
public class HighMultiReplacerV2 extends StrReplacer {
|
public class HighMultiReplacerV2 extends StrReplacer {
|
||||||
|
private static final long serialVersionUID = 1L;
|
||||||
|
|
||||||
private final AhoCorasickAutomaton ahoCorasickAutomaton;
|
private final AhoCorasickAutomaton ahoCorasickAutomaton;
|
||||||
|
|
||||||
@ -49,19 +49,17 @@ public class HighMultiReplacerV2 extends StrReplacer {
|
|||||||
protected static class AhoCorasickAutomaton extends MultiStrFinder{
|
protected static class AhoCorasickAutomaton extends MultiStrFinder{
|
||||||
protected final Map<String,String> replaceMap;
|
protected final Map<String,String> replaceMap;
|
||||||
|
|
||||||
public AhoCorasickAutomaton(Map<String,String> replaceMap){
|
public AhoCorasickAutomaton(final Map<String,String> replaceMap){
|
||||||
super(replaceMap.keySet());
|
super(replaceMap.keySet());
|
||||||
this.replaceMap = replaceMap;
|
this.replaceMap = replaceMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void replace(final CharSequence text, final StringBuilder stringBuilder){
|
public void replace(final CharSequence text, final StringBuilder stringBuilder){
|
||||||
Node currentNode = root;
|
Node currentNode = root;
|
||||||
// 临时字符串存储空间
|
// 临时字符串存储空间
|
||||||
StringBuilder temp = new StringBuilder();
|
final StringBuilder temp = new StringBuilder();
|
||||||
for (int i = 0; i < text.length(); i++) {
|
for (int i = 0; i < text.length(); i++) {
|
||||||
char ch = text.charAt(i);
|
final char ch = text.charAt(i);
|
||||||
Integer index = charIndex.get(ch);
|
final Integer index = charIndexMap.get(ch);
|
||||||
// 下一个字符在候选转换字符串中都不存在 ch字符一定不会被替换
|
// 下一个字符在候选转换字符串中都不存在 ch字符一定不会被替换
|
||||||
if(index < 0){
|
if(index < 0){
|
||||||
// 临时缓存空间中的数据写入到输出的 StringBuilder
|
// 临时缓存空间中的数据写入到输出的 StringBuilder
|
||||||
@ -94,7 +92,7 @@ public class HighMultiReplacerV2 extends StrReplacer {
|
|||||||
|
|
||||||
// 表示匹配到 现在进行字符串替换工作
|
// 表示匹配到 现在进行字符串替换工作
|
||||||
if(currentNode.isEnd){
|
if(currentNode.isEnd){
|
||||||
int length = currentNode.tagetString.length();
|
final int length = currentNode.tagetString.length();
|
||||||
// 先清理匹配到的字符 最后一个字符未加入临时空间
|
// 先清理匹配到的字符 最后一个字符未加入临时空间
|
||||||
temp.delete(temp.length() - length + 1,length - 1);
|
temp.delete(temp.length() - length + 1,length - 1);
|
||||||
if(temp.length() > 0){
|
if(temp.length() > 0){
|
||||||
|
Loading…
x
Reference in New Issue
Block a user