feat:基于AC自动机算法实现的高效替换器

This commit is contained in:
kyao 2023-08-30 11:05:20 +08:00
parent 07de6aa4f0
commit a53e994a84
2 changed files with 244 additions and 0 deletions

View File

@ -0,0 +1,212 @@
/*
* Copyright (c) 2023 looly(loolly@aliyun.com)
* Hutool is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* https://license.coscl.org.cn/MulanPSL2
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
*/
package org.dromara.hutool.core.text.replacer;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
/**
* 高效替换器通过查找指定关键字替换对应的值
* 基于AC自动机算法实现需要被替换的原字符串越大替换的键值对越多效率提升越明显
* <p>
* 注意: 如果需要被替换的关键字出现交叉,最先匹配中的关键字会被替换
* 1"abc","ab" 会优先替换"ab"
* 2"abed","be" 会优先替换"abed"
* 3"abc", "bc" 会优先替换"abc"
*
* @author kyao
*/
public class HighMultiReplacer extends StrReplacer {
private final AhoCorasickAutomaton ahoCorasickAutomaton;
/**
* 构造
*
* @param map key为需要被查找的字符串value为对应的替换的值
*/
public HighMultiReplacer(Map<String, String> map) {
ahoCorasickAutomaton = new AhoCorasickAutomaton(map);
}
/**
* 生成一个HighMultiReplacer对象
*
* @param map key为需要被查找的字符串value为对应的替换的值
* @return org.dromara.hutool.core.text.replacer.HighMultiReplacer
*/
public static HighMultiReplacer of(Map<String, String> map) {
return new HighMultiReplacer(map);
}
@Override
protected int replace(CharSequence str, int pos, StringBuilder out) {
ahoCorasickAutomaton.replace(str, out);
return str.length();
}
@Override
public CharSequence apply(final CharSequence str) {
StringBuilder builder = new StringBuilder();
replace(str, 0, builder);
return builder;
}
/**
* AC自动机
*/
private static class AhoCorasickAutomaton {
/*AC自动机的根结点根结点不存储任何字符信息*/
private final Node root;
/*待查找的目标字符串集合*/
private final Map<String, String> target;
/**
* @param target 待查找的目标字符串集合
*/
public AhoCorasickAutomaton(Map<String, String> target) {
root = new Node();
this.target = target;
buildTrieTree();
buildAcFromTrie();
}
/**
* 用于表示AC自动机的每个结点在每个结点中我们并没有存储该结点对应的字符
*/
private static class Node {
/*如果该结点是一个终点从根结点到此结点表示了一个目标字符串则str != null, 且str就表示该字符串*/
String str;
/*该节点下的子节点*/
Map<Character, Node> children = new HashMap<>();
/*当前结点的孩子结点不能匹配文本串中的某个字符时,下一个应该查找的结点*/
Node fail;
public boolean isWord() {
return str != null;
}
}
/**
* 由目标字符串构建Trie树
*/
private void buildTrieTree() {
for (String targetStr : target.keySet()) {
Node curr = root;
if (targetStr == null) {
continue;
}
for (int i = 0; i < targetStr.length(); i++) {
char ch = targetStr.charAt(i);
Node node = curr.children.get(ch);
if (node == null) {
node = new Node();
curr.children.put(ch, node);
}
curr = node;
}
/*将每个目标字符串的最后一个字符对应的结点变成终点*/
curr.str = targetStr;
}
}
/**
* 由Trie树构建AC自动机本质是一个自动机相当于构建KMP算法的next数组
*/
private void buildAcFromTrie() {
/*广度优先遍历所使用的队列*/
LinkedList<Node> queue = new LinkedList<>();
/*单独处理根结点的所有孩子结点*/
for (Node x : root.children.values()) {
/*根结点的所有孩子结点的fail都指向根结点*/
x.fail = root;
queue.addLast(x);/*所有根结点的孩子结点入列*/
}
while (!queue.isEmpty()) {
/*确定出列结点的所有孩子结点的fail的指向*/
Node p = queue.removeFirst();
for (Map.Entry<Character, Node> entry : p.children.entrySet()) {
/*孩子结点入列*/
queue.addLast(entry.getValue());
/*从p.fail开始找起*/
Node failTo = p.fail;
while (true) {
/*说明找到了根结点还没有找到*/
if (failTo == null) {
entry.getValue().fail = root;
break;
}
/*说明有公共前缀*/
if (failTo.children.get(entry.getKey()) != null) {
entry.getValue().fail = failTo.children.get(entry.getKey());
break;
} else {/*继续向上寻找*/
failTo = failTo.fail;
}
}
}
}
}
/**
* 在文本串中替换所有的目标字符串
*
* @param text 被替换的目标字符串
* @param stringBuilder 替换后的结果
*/
public void replace(CharSequence text, StringBuilder stringBuilder) {
Node curr = root;
int i = 0;
while (i < text.length()) {
/*文本串中的字符*/
char ch = text.charAt(i);
/*文本串中的字符和AC自动机中的字符进行比较*/
Node node = curr.children.get(ch);
if (node != null) {
stringBuilder.append(ch);
/*若相等,自动机进入下一状态*/
curr = node;
if (curr.isWord()) {
stringBuilder.delete(stringBuilder.length() - curr.str.length(), stringBuilder.length());
stringBuilder.append(target.get(curr.str));
curr = root;
}
/*索引自增,指向下一个文本串中的字符*/
i++;
} else {
/*若不等,找到下一个应该比较的状态*/
curr = curr.fail;
/*到根结点还未找到说明文本串中以ch作为结束的字符片段不是任何目标字符串的前缀状态机重置比较下一个字符*/
if (curr == null) {
stringBuilder.append(ch);
curr = root;
i++;
}
}
}
}
}
}

View File

@ -0,0 +1,32 @@
package org.dromara.hutool.core.text.replacer;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.HashMap;
import java.util.Map;
public class HighMultiReplacerTest {
@Test
public void replaceTest() {
Map<String, String> map = new HashMap<>();
map.put("abc", "bar");
map.put("123", "1234");
map.put("12", "哈哈哈");
map.put("bc", "67868");
map.put("你好", "hello world");
map.put("AAACC", "%^$%$");
map.put("_啊", "qqqq");
HighMultiReplacer replacer = HighMultiReplacer.of(map);
Assertions.assertEquals("bar哈哈哈hello world4", replacer.apply("abc12你好4").toString());
Assertions.assertEquals("qqqq啊qqqq-啊", replacer.apply("_啊啊_啊-啊").toString());
Assertions.assertEquals("哈哈哈3456789", replacer.apply("123456789").toString());
Assertions.assertEquals("AAAC67868", replacer.apply("AAACbc").toString());
Assertions.assertEquals("哈哈哈哈哈1哈哈", replacer.apply("哈哈121哈哈").toString());
Assertions.assertEquals("你hello world好%^$%$CACAC", replacer.apply("你你好好AAACCCACAC").toString());
Assertions.assertEquals("哈哈哈3", replacer.apply("123").toString());
Assertions.assertEquals("---11", replacer.apply("---11").toString());
Assertions.assertEquals("", replacer.apply("").toString());
}
}