!1251 https://gitee.com/dromara/hutool/issues/I8OJQZ AC自动机实现

Merge pull request !1251 from 好人难当/v6-dev
2025-05-09 23:51:34 +08:00 · 2024-08-10 00:27:51 +00:00 · 2024-08-10 00:27:51 +00:00 · f78f9569e6
commit f78f9569e6
parent b9b6069ded 6448ded4e7
2 changed files with 394 additions and 0 deletions
--- a/hutool-core/src/main/java/org/dromara/hutool/core/text/finder/MultiStrFinder.java
+++ b/hutool-core/src/main/java/org/dromara/hutool/core/text/finder/MultiStrFinder.java
@ -0,0 +1,279 @@
+package org.dromara.hutool.core.text.finder;
+
+import java.util.*;
+
+/**
+ * Multi-string finder implemented as an Aho-Corasick automaton.
+ * <p>
+ * The automaton (prefix tree, fail pointers and the dense transition table) is built once in the
+ * constructor. {@link #findMatch(String)} keeps its traversal state in local variables only, so a
+ * single instance can be reused for any number of searches.
+ *
+ * @author newshiJ
+ * @date 2024/8/2 10:07 AM
+ */
+public class MultiStrFinder {
+
+	// Maps every character that occurs in a keyword to its column in Node#directRouter
+	protected final Map<Character, Integer> charIndex = new HashMap<>();
+
+	// Number of distinct characters over all keywords (the width of every directRouter array)
+	protected final int allCharSize;
+
+	// Root node of the automaton
+	protected final Node root;
+
+	// Total number of nodes, root included
+	int nodeSize;
+
+	/**
+	 * Builds a finder for the given keywords.
+	 * <p>
+	 * {@code null} and empty keywords are ignored: an empty keyword would mark the root node as a
+	 * match end and make every position of every text look like a hit.
+	 *
+	 * @param source keywords to search for; duplicates are collapsed
+	 */
+	public MultiStrFinder(Collection<String> source) {
+		// Distinct keywords to match
+		final Set<String> stringSet = new HashSet<>();
+
+		// Every character used by at least one keyword
+		final Set<Character> charSet = new HashSet<>();
+		for (String string : source) {
+			if (string == null || string.isEmpty()) {
+				continue;
+			}
+			stringSet.add(string);
+			for (int i = 0; i < string.length(); i++) {
+				charSet.add(string.charAt(i));
+			}
+		}
+		allCharSize = charSet.size();
+
+		// Give each character a dense index so nodes can use plain arrays instead of hash maps
+		int index = 0;
+		for (Character c : charSet) {
+			charIndex.put(c, index);
+			index++;
+		}
+
+		root = Node.createRoot(allCharSize);
+
+		buildPrefixTree(stringSet);
+		buildFail();
+	}
+
+	/**
+	 * Builds the prefix tree (trie) of all keywords below {@link #root}.
+	 *
+	 * @param stringSst keywords to insert; every character must already be present in {@link #charIndex}
+	 */
+	protected void buildPrefixTree(Collection<String> stringSst) {
+		// The root already owns number 0, so numbering of the other nodes starts at 1
+		int nodeIndex = 1;
+		for (String string : stringSst) {
+			Node node = root;
+			for (int i = 0; i < string.length(); i++) {
+				final char c = string.charAt(i);
+				// addValue returns true only when a new child was created, i.e. a number was consumed
+				if (node.addValue(c, nodeIndex, charIndex)) {
+					nodeIndex++;
+				}
+				node = node.directRouter[getIndex(c)];
+			}
+			node.setEnd(string);
+		}
+		nodeSize = nodeIndex;
+	}
+
+	/**
+	 * Computes the fail pointer of every node and completes every {@code directRouter} table.
+	 * <p>
+	 * After this method each slot of each table is non-null: a slot without a trie child is filled
+	 * with the node that following the fail pointers would eventually reach, so matching needs a
+	 * single array lookup per character.
+	 */
+	protected void buildFail() {
+		final Deque<Node> nodeQueue = new ArrayDeque<>();
+		for (int i = 0; i < root.directRouter.length; i++) {
+			final Node nextNode = root.directRouter[i];
+			if (nextNode == null) {
+				// No keyword starts with this character: stay on the root
+				root.directRouter[i] = root;
+				continue;
+			}
+			nextNode.fail = root;
+			nodeQueue.addLast(nextNode);
+		}
+
+		// Breadth-first traversal: a node's fail target is always shallower, hence already completed
+		while (!nodeQueue.isEmpty()) {
+			final Node parent = nodeQueue.removeFirst();
+			// Thanks to charIndex the slot number i stands for exactly one character
+			for (int i = 0; i < parent.directRouter.length; i++) {
+				final Node child = parent.directRouter[i];
+				if (child == null) {
+					// No trie child for this character: borrow the transition of the fail node
+					parent.directRouter[i] = parent.fail.directRouter[i];
+					continue;
+				}
+				child.fail = parent.fail.directRouter[i];
+				nodeQueue.addLast(child);
+				child.fail.failPre.add(child);
+			}
+		}
+	}
+
+	/**
+	 * Finds every occurrence of every keyword in the text, overlapping occurrences included.
+	 * <p>
+	 * For each position the fail chain of the reached node is walked, so keywords that are a proper
+	 * suffix of a longer keyword path (for example "bc" while "abcd" is being followed) are reported
+	 * as well.
+	 *
+	 * @param text text to search; {@code null} or empty yields an empty result
+	 * @return map from matched keyword to the start indexes of its occurrences, in ascending order
+	 */
+	public Map<String, List<Integer>> findMatch(String text) {
+		// All state lives in locals so that one finder can serve many searches
+		final Map<String, List<Integer>> resultMap = new HashMap<>();
+		if (text == null || text.isEmpty()) {
+			return resultMap;
+		}
+
+		Node currentNode = root;
+		for (int i = 0; i < text.length(); i++) {
+			final Integer index = charIndex.get(text.charAt(i));
+			// A character used by no keyword cannot be part of a match: restart from the root
+			if (index == null) {
+				currentNode = root;
+				continue;
+			}
+			// Either a real trie edge or the transition pre-computed from the fail pointers
+			currentNode = currentNode.directRouter[index];
+			// Report the keyword ending at this node and every keyword ending on its fail chain
+			for (Node hit = currentNode; hit != null && hit != root; hit = hit.fail) {
+				if (hit.isEnd) {
+					resultMap.computeIfAbsent(hit.tagetString, k -> new ArrayList<>())
+						.add(i - hit.tagetString.length() + 1);
+				}
+			}
+		}
+
+		return resultMap;
+	}
+
+
+	/**
+	 * Returns the dense index of a character.
+	 *
+	 * @param c character to look up
+	 * @return index of the character in {@link #charIndex}, or -1 when no keyword contains it
+	 */
+	protected int getIndex(char c) {
+		final Integer i = charIndex.get(c);
+		return i == null ? -1 : i;
+	}
+
+
+	/**
+	 * Static factory, equivalent to calling the constructor.
+	 *
+	 * @param source keywords to search for
+	 * @return a new finder
+	 */
+	public static MultiStrFinder create(Collection<String> source) {
+		return new MultiStrFinder(source);
+	}
+
+	/**
+	 * Node of the Aho-Corasick automaton.
+	 */
+	protected static class Node {
+		// Whether a keyword ends at this node
+		public boolean isEnd = false;
+
+		// The keyword ending at this node when isEnd is true, otherwise null
+		public String tagetString;
+
+		// Fail pointer
+		public Node fail;
+
+		/**
+		 * Direct transition table, one slot per distinct keyword character (see charIndex).
+		 * An array indexed through charIndex is used instead of a per-node hash map to avoid
+		 * hashing during matching and the memory overhead of many small maps.
+		 */
+		public Node[] directRouter;
+
+		// Node number; the root is 0
+		public int nodeIndex;
+
+		// Character on the edge leading to this node
+		public char value;
+
+		// Nodes whose fail pointer points at this node
+		public List<Node> failPre = new ArrayList<>();
+
+		public Node() {
+		}
+
+		/**
+		 * Adds a child for the character unless one already exists.
+		 *
+		 * @param c         character of the new edge; must be present in {@code charIndex}
+		 * @param nodeIndex number assigned to the child if it is created
+		 * @param charIndex character-to-slot mapping
+		 * @return {@code true} if a child was created, {@code false} if it already existed
+		 */
+		public boolean addValue(char c, int nodeIndex, Map<Character, Integer> charIndex) {
+			final Integer index = charIndex.get(c);
+			if (directRouter[index] != null) {
+				return false;
+			}
+			final Node node = new Node();
+			node.nodeIndex = nodeIndex;
+			node.directRouter = new Node[directRouter.length];
+			node.value = c;
+			directRouter[index] = node;
+			return true;
+		}
+
+		/**
+		 * Marks this node as the end of a keyword.
+		 *
+		 * @param string the keyword that ends here
+		 */
+		public void setEnd(String string) {
+			tagetString = string;
+			isEnd = true;
+		}
+
+		/**
+		 * Returns the node reached from this node by the given character.
+		 *
+		 * @param c         character to follow
+		 * @param charIndex character-to-slot mapping
+		 * @return the target node, or {@code null} when the character is unknown or the slot is empty
+		 */
+		public Node getNext(char c, Map<Character, Integer> charIndex) {
+			final Integer index = charIndex.get(c);
+			if (index == null) {
+				return null;
+			}
+			return directRouter[index];
+		}
+
+		/**
+		 * Creates the root node: number 0, fail pointer to itself, empty transition table.
+		 *
+		 * @param allCharSize number of distinct keyword characters
+		 * @return the root node
+		 */
+		public static Node createRoot(int allCharSize) {
+			final Node node = new Node();
+			node.nodeIndex = 0;
+			node.fail = node;
+			node.directRouter = new Node[allCharSize];
+			return node;
+		}
+
+		@Override
+		public String toString() {
+			return value + ":" + nodeIndex;
+		}
+	}
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/hutool-core/src/main/java/org/dromara/hutool/core/text/replacer/HighMultiReplacerV2.java
+++ b/hutool-core/src/main/java/org/dromara/hutool/core/text/replacer/HighMultiReplacerV2.java
@ -0,0 +1,115 @@
+package org.dromara.hutool.core.text.replacer;
+
+import org.dromara.hutool.core.text.finder.MultiStrFinder;
+
+import java.util.*;
+
+/**
+ * Efficient multi-keyword replacer: finds the configured keywords in a text and replaces each
+ * occurrence with its mapped value.
+ * Based on an Aho-Corasick automaton; the larger the text and the more key/value pairs there are,
+ * the more noticeable the gain.
+ * <p>
+ * Note: when keywords overlap, the keyword that is matched first gets replaced. Only the keyword
+ * ending exactly at the automaton's current node is considered, so a keyword embedded in a longer
+ * keyword path is not detected while the longer path is still being followed:
+ * <ol>
+ *   <li>"abc", "ab": "ab" is replaced first</li>
+ *   <li>"abed", "be": "abed" is replaced first</li>
+ *   <li>"abc", "bc": "abc" is replaced first</li>
+ * </ol>
+ *
+ * @author newshiJ
+ * @date 2024/8/2 3:41 PM
+ */
+public class HighMultiReplacerV2 extends StrReplacer {
+
+	private final AhoCorasickAutomaton ahoCorasickAutomaton;
+
+	/**
+	 * Constructor.
+	 *
+	 * @param map keys are the strings to look for, values are their replacements
+	 */
+	public HighMultiReplacerV2(final Map<String, String> map) {
+		ahoCorasickAutomaton = new AhoCorasickAutomaton(map);
+	}
+
+	/**
+	 * Replaces all keywords of the whole text in one pass and appends the result to {@code out}.
+	 * The {@code pos} argument is not used: processing always starts at index 0.
+	 *
+	 * @param str text to process
+	 * @param pos ignored
+	 * @param out receives the replaced text
+	 * @return {@code str.length()}
+	 */
+	@Override
+	protected int replace(final CharSequence str, final int pos, final StringBuilder out) {
+		ahoCorasickAutomaton.replace(str, out);
+		return str.length();
+	}
+
+	/**
+	 * Applies the replacement to the given text.
+	 *
+	 * @param str text to process; {@code null} is returned unchanged
+	 * @return the text with every matched keyword replaced
+	 */
+	@Override
+	public CharSequence apply(final CharSequence str) {
+		if (str == null) {
+			return null;
+		}
+		final StringBuilder builder = new StringBuilder(str.length());
+		replace(str, 0, builder);
+		return builder;
+	}
+
+
+	/**
+	 * Aho-Corasick automaton that replaces matched keywords instead of reporting them.
+	 */
+	protected static class AhoCorasickAutomaton extends MultiStrFinder {
+		protected final Map<String, String> replaceMap;
+
+		/**
+		 * @param replaceMap keyword to replacement mapping; its key set defines the automaton
+		 */
+		public AhoCorasickAutomaton(Map<String, String> replaceMap) {
+			super(replaceMap.keySet());
+			this.replaceMap = replaceMap;
+		}
+
+
+		/**
+		 * Scans the text once and appends it to {@code stringBuilder} with every matched keyword
+		 * replaced by its mapped value.
+		 *
+		 * @param text          text to process
+		 * @param stringBuilder receives the output
+		 */
+		public void replace(final CharSequence text, final StringBuilder stringBuilder) {
+			Node currentNode = root;
+			// Characters read since the last flush; they may still turn out to belong to a keyword
+			final StringBuilder temp = new StringBuilder();
+			final int textLength = text.length();
+			for (int i = 0; i < textLength; i++) {
+				final char ch = text.charAt(i);
+				// getIndex yields -1 for unknown characters (a raw map lookup would yield null)
+				final int index = getIndex(ch);
+				if (index < 0) {
+					// ch occurs in no keyword, so neither it nor the buffered characters can match
+					stringBuilder.append(temp).append(ch);
+					temp.setLength(0);
+					currentNode = root;
+					continue;
+				}
+
+				currentNode = currentNode.directRouter[index];
+
+				if (currentNode == root) {
+					// ch is a keyword character but no keyword prefix ends here: the match attempt
+					// is over, so everything buffered plus ch goes to the output unchanged
+					stringBuilder.append(temp).append(ch);
+					temp.setLength(0);
+					continue;
+				}
+
+				if (currentNode.isEnd) {
+					final String keyword = currentNode.tagetString;
+					// The buffer ends with the keyword minus its last character (ch itself was
+					// never buffered); cut that tail off and keep only what preceded the keyword
+					temp.setLength(temp.length() - (keyword.length() - 1));
+					stringBuilder.append(temp).append(replaceMap.get(keyword));
+					// Both the buffer and the automaton start afresh after a replacement
+					temp.setLength(0);
+					currentNode = root;
+					continue;
+				}
+
+				temp.append(ch);
+			}
+			// A partial match still buffered at the end of the text must not be lost
+			stringBuilder.append(temp);
+		}
+	}
+
+}