mirror of
https://gitee.com/chinabugotech/hutool.git
synced 2025-05-09 23:51:34 +08:00
feat:基于NFA模型的 AC自动机,实现多模匹配
This commit is contained in:
parent
10a16c4ee3
commit
da69cde765
144
hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java
Normal file
144
hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java
Normal file
@ -0,0 +1,144 @@
|
||||
package cn.hutool.core.text.dfa;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
*
|
||||
* 基于非确定性有穷自动机(NFA) 实现的多模匹配工具
|
||||
*
|
||||
* @author renyp
|
||||
*/
|
||||
public class Automaton {
|
||||
private final Node root;
|
||||
|
||||
/**
|
||||
* 默认构造
|
||||
*/
|
||||
public Automaton() {
|
||||
this.root = new Node();
|
||||
}
|
||||
|
||||
/**
|
||||
* 构造函数 并 初始化词库
|
||||
*
|
||||
* @param words 添加的新词
|
||||
*/
|
||||
public Automaton(String... words) {
|
||||
this();
|
||||
this.insert(words);
|
||||
}
|
||||
|
||||
/**
|
||||
* 词库添加新词,初始化查找树
|
||||
*
|
||||
* @param word 添加的新词
|
||||
*/
|
||||
public void insert(String word) {
|
||||
Node p = root;
|
||||
for (char curr : word.toCharArray()) {
|
||||
int ind = curr;
|
||||
if (p.next.get(ind) == null) {
|
||||
p.next.put(ind, new Node());
|
||||
}
|
||||
p = p.next.get(ind);
|
||||
}
|
||||
p.flag = true;
|
||||
p.str = word;
|
||||
}
|
||||
|
||||
/**
|
||||
* 词库批量添加新词,初始化查找树
|
||||
*
|
||||
* @param words 添加的新词
|
||||
*/
|
||||
public void insert(String... words) {
|
||||
for (String word : words) {
|
||||
this.insert(word);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 构建基于NFA模型的 AC自动机
|
||||
*/
|
||||
public void buildAc() {
|
||||
Queue<Node> queue = new LinkedList<>();
|
||||
Node p = root;
|
||||
for (Integer key : p.next.keySet()) {
|
||||
p.next.get(key).fail = root;
|
||||
queue.offer(p.next.get(key));
|
||||
}
|
||||
while (!queue.isEmpty()) {
|
||||
Node curr = queue.poll();
|
||||
for (Integer key : curr.next.keySet()) {
|
||||
Node fail = curr.fail;
|
||||
// 查找当前节点匹配失败,他对应等效匹配的节点是哪个
|
||||
while (fail != null && fail.next.get(key) == null) {
|
||||
fail = fail.fail;
|
||||
}
|
||||
// 代码到这,有两种可能,fail不为null,说明找到了fail;fail为null,没有找到,那么就把fail指向root节点(当到该节点匹配失败,那么从root节点开始重新匹配)
|
||||
if (fail != null) {
|
||||
fail = fail.next.get(key);
|
||||
} else {
|
||||
fail = root;
|
||||
}
|
||||
curr.next.get(key).fail = fail;
|
||||
queue.offer(curr.next.get(key));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param text 查询的文本(母串)
|
||||
*/
|
||||
public List<FoundWord> find(String text) {
|
||||
return this.find(text, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param text 查找的文本(母串)
|
||||
* @param isDensityMatch 是否密集匹配
|
||||
*/
|
||||
public List<FoundWord> find(String text, boolean isDensityMatch) {
|
||||
List<FoundWord> ans = new ArrayList<>();
|
||||
Node p = root, k = null;
|
||||
for (int i = 0, len = text.length(); i < len; i++) {
|
||||
int ind = text.charAt(i);
|
||||
// 状态转移(沿着fail指针链接的链表,此处区别于DFA模型)
|
||||
while (p != null && p.next.get(ind) == null) {
|
||||
p = p.fail;
|
||||
}
|
||||
if (p == null) {
|
||||
p = root;
|
||||
} else {
|
||||
p = p.next.get(ind);
|
||||
}
|
||||
// 提取结果(沿着fail指针链接的链表,此处区别于DFA模型)
|
||||
k = p;
|
||||
while (k != null) {
|
||||
if (k.flag) {
|
||||
ans.add(new FoundWord(k.str, k.str, i - k.str.length() + 1, i));
|
||||
if (!isDensityMatch) {
|
||||
p = root;
|
||||
break;
|
||||
}
|
||||
}
|
||||
k = k.fail;
|
||||
}
|
||||
}
|
||||
return ans;
|
||||
}
|
||||
|
||||
private static class Node {
|
||||
|
||||
boolean flag;
|
||||
Node fail;
|
||||
String str;
|
||||
Map<Integer, Node> next;
|
||||
|
||||
public Node() {
|
||||
this.flag = false;
|
||||
next = new HashMap<>();
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,225 @@
|
||||
package cn.hutool.core.text.dfa;
|
||||
|
||||
import cn.hutool.core.date.StopWatch;
|
||||
import junit.framework.TestCase;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class AutomatonTest extends TestCase {
|
||||
|
||||
/**
|
||||
* 密集匹配 测试查找结果,并与WordTree对比效率
|
||||
*/
|
||||
public void testFind() {
|
||||
Automaton automaton = new Automaton();
|
||||
WordTree wordTree = new WordTree();
|
||||
automaton.insert("say", "her", "he", "she", "shr");
|
||||
automaton.buildAc();
|
||||
wordTree.addWords("say", "her", "he", "she", "shr");
|
||||
|
||||
StopWatch stopWatch = new StopWatch();
|
||||
String input = "sasherhsay";
|
||||
|
||||
stopWatch.start("automaton_char_find");
|
||||
List<FoundWord> ans1 = automaton.find(input);
|
||||
stopWatch.stop();
|
||||
assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
|
||||
assertEquals(Integer.valueOf(2), ans1.get(0).getStartIndex());
|
||||
assertEquals(Integer.valueOf(4), ans1.get(0).getEndIndex());
|
||||
assertEquals(Integer.valueOf(3), ans1.get(1).getStartIndex());
|
||||
assertEquals(Integer.valueOf(4), ans1.get(1).getEndIndex());
|
||||
assertEquals(Integer.valueOf(3), ans1.get(2).getStartIndex());
|
||||
assertEquals(Integer.valueOf(5), ans1.get(2).getEndIndex());
|
||||
assertEquals(Integer.valueOf(7), ans1.get(3).getStartIndex());
|
||||
assertEquals(Integer.valueOf(9), ans1.get(3).getEndIndex());
|
||||
|
||||
stopWatch.start("wordtree_char_find");
|
||||
List<String> ans2 = wordTree.matchAll(input, -1, true, true);
|
||||
stopWatch.stop();
|
||||
assertEquals("she,he,her,say", String.join(",", ans2));
|
||||
|
||||
System.out.println(stopWatch.prettyPrint());
|
||||
}
|
||||
|
||||
/**
|
||||
* 非密集匹配 测试查找结果,并与WordTree对比效率
|
||||
*/
|
||||
public void testFindNotDensity() {
|
||||
Automaton automaton = new Automaton();
|
||||
WordTree wordTree = new WordTree();
|
||||
automaton.insert("say", "her", "he", "she", "shr");
|
||||
automaton.buildAc();
|
||||
wordTree.addWords("say", "her", "he", "she", "shr");
|
||||
|
||||
StopWatch stopWatch = new StopWatch();
|
||||
String input = "sasherhsay";
|
||||
|
||||
stopWatch.start("automaton_char_find_not_density");
|
||||
List<FoundWord> ans1 = automaton.find(input, false);
|
||||
stopWatch.stop();
|
||||
assertEquals("she,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
|
||||
assertEquals(Integer.valueOf(2), ans1.get(0).getStartIndex());
|
||||
assertEquals(Integer.valueOf(4), ans1.get(0).getEndIndex());
|
||||
assertEquals(Integer.valueOf(7), ans1.get(1).getStartIndex());
|
||||
assertEquals(Integer.valueOf(9), ans1.get(1).getEndIndex());
|
||||
|
||||
stopWatch.start("wordtree_char_find_not_density");
|
||||
List<String> ans2 = wordTree.matchAll(input, -1, false, true);
|
||||
stopWatch.stop();
|
||||
assertEquals("she,say", String.join(",", ans2));
|
||||
|
||||
System.out.println(stopWatch.prettyPrint());
|
||||
}
|
||||
|
||||
/**
|
||||
* 密集匹配 测试建树和查找,并与WordTree对比效率
|
||||
*/
|
||||
public void testBuildAndFind() {
|
||||
StopWatch stopWatch = new StopWatch();
|
||||
String input = "sasherhsay";
|
||||
|
||||
stopWatch.start("automaton_char_buid_find");
|
||||
Automaton automatonLocal = new Automaton();
|
||||
automatonLocal.insert("say", "her", "he", "she", "shr");
|
||||
automatonLocal.buildAc();
|
||||
List<FoundWord> ans1 = automatonLocal.find(input);
|
||||
stopWatch.stop();
|
||||
assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
|
||||
assertEquals(Integer.valueOf(2), ans1.get(0).getStartIndex());
|
||||
assertEquals(Integer.valueOf(4), ans1.get(0).getEndIndex());
|
||||
assertEquals(Integer.valueOf(3), ans1.get(1).getStartIndex());
|
||||
assertEquals(Integer.valueOf(4), ans1.get(1).getEndIndex());
|
||||
assertEquals(Integer.valueOf(3), ans1.get(2).getStartIndex());
|
||||
assertEquals(Integer.valueOf(5), ans1.get(2).getEndIndex());
|
||||
assertEquals(Integer.valueOf(7), ans1.get(3).getStartIndex());
|
||||
assertEquals(Integer.valueOf(9), ans1.get(3).getEndIndex());
|
||||
|
||||
stopWatch.start("wordtree_char_build_find");
|
||||
WordTree wordTreeLocal = new WordTree();
|
||||
wordTreeLocal.addWords("say", "her", "he", "she", "shr");
|
||||
List<String> ans2 = wordTreeLocal.matchAll(input, -1, true, true);
|
||||
stopWatch.stop();
|
||||
assertEquals("she,he,her,say", String.join(",", ans2));
|
||||
|
||||
System.out.println(stopWatch.prettyPrint());
|
||||
}
|
||||
|
||||
/**
|
||||
* 密集匹配 构建树和查找 测试中文字符,并与wordTree对比效率
|
||||
*/
|
||||
@Test
|
||||
public void testBuildFindCnChar() {
|
||||
StopWatch stopWatch = new StopWatch();
|
||||
String input = "赵啊三在做什么";
|
||||
|
||||
stopWatch.start("automaton_cn_build_find");
|
||||
Automaton automatonLocal = new Automaton();
|
||||
automatonLocal.insert("赵", "赵啊", "赵啊三");
|
||||
automatonLocal.buildAc();
|
||||
|
||||
final List<FoundWord> result = automatonLocal.find(input);
|
||||
stopWatch.stop();
|
||||
|
||||
Assert.assertEquals(3, result.size());
|
||||
Assert.assertEquals("赵,赵啊,赵啊三", result.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
|
||||
assertEquals(Integer.valueOf(0), result.get(0).getStartIndex());
|
||||
assertEquals(Integer.valueOf(0), result.get(0).getEndIndex());
|
||||
assertEquals(Integer.valueOf(0), result.get(1).getStartIndex());
|
||||
assertEquals(Integer.valueOf(1), result.get(1).getEndIndex());
|
||||
assertEquals(Integer.valueOf(0), result.get(2).getStartIndex());
|
||||
assertEquals(Integer.valueOf(2), result.get(2).getEndIndex());
|
||||
|
||||
stopWatch.start("wordtree_cn_build_find");
|
||||
WordTree wordTreeLocal = new WordTree();
|
||||
wordTreeLocal.addWords("赵", "赵啊", "赵啊三");
|
||||
|
||||
final List<String> result1 = wordTreeLocal.matchAll(input, -1, true, true);
|
||||
stopWatch.stop();
|
||||
|
||||
Assert.assertEquals(3, result1.size());
|
||||
Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1));
|
||||
|
||||
System.out.println(stopWatch.prettyPrint());
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 密集匹配 测试构建树和查找 中文字符,并与wordTree对比效率
|
||||
*/
|
||||
@Test
|
||||
public void testFindCNChar() {
|
||||
StopWatch stopWatch = new StopWatch();
|
||||
String input = "赵啊三在做什么";
|
||||
|
||||
Automaton automatonLocal = new Automaton();
|
||||
automatonLocal.insert("赵", "赵啊", "赵啊三");
|
||||
automatonLocal.buildAc();
|
||||
|
||||
stopWatch.start("automaton_cn_find");
|
||||
final List<FoundWord> result = automatonLocal.find(input);
|
||||
stopWatch.stop();
|
||||
|
||||
Assert.assertEquals(3, result.size());
|
||||
Assert.assertEquals("赵,赵啊,赵啊三", result.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
|
||||
assertEquals(Integer.valueOf(0), result.get(0).getStartIndex());
|
||||
assertEquals(Integer.valueOf(0), result.get(0).getEndIndex());
|
||||
assertEquals(Integer.valueOf(0), result.get(1).getStartIndex());
|
||||
assertEquals(Integer.valueOf(1), result.get(1).getEndIndex());
|
||||
assertEquals(Integer.valueOf(0), result.get(2).getStartIndex());
|
||||
assertEquals(Integer.valueOf(2), result.get(2).getEndIndex());
|
||||
|
||||
WordTree wordTreeLocal = new WordTree();
|
||||
wordTreeLocal.addWords("赵", "赵啊", "赵啊三");
|
||||
|
||||
stopWatch.start("wordtree_cn_find");
|
||||
final List<String> result1 = wordTreeLocal.matchAllWords(input, -1, true, true).stream().map(FoundWord::getWord)
|
||||
.collect(Collectors.toList());
|
||||
stopWatch.stop();
|
||||
|
||||
Assert.assertEquals(3, result1.size());
|
||||
Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1));
|
||||
|
||||
System.out.println(stopWatch.prettyPrint());
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 非密集匹配 测试构建树和查找 中文字符,并与wordTree对比效率,
|
||||
*/
|
||||
@Test
|
||||
public void testFindCNCharNotDensity() {
|
||||
StopWatch stopWatch = new StopWatch();
|
||||
String input = "赵啊三在做什么";
|
||||
|
||||
Automaton automatonLocal = new Automaton();
|
||||
automatonLocal.insert("赵", "赵啊", "赵啊三");
|
||||
automatonLocal.buildAc();
|
||||
|
||||
stopWatch.start("automaton_cn_find_not_density");
|
||||
final List<FoundWord> result = automatonLocal.find(input, false);
|
||||
stopWatch.stop();
|
||||
|
||||
Assert.assertEquals(1, result.size());
|
||||
Assert.assertEquals("赵", result.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
|
||||
assertEquals(Integer.valueOf(0), result.get(0).getStartIndex());
|
||||
assertEquals(Integer.valueOf(0), result.get(0).getEndIndex());
|
||||
|
||||
WordTree wordTreeLocal = new WordTree();
|
||||
wordTreeLocal.addWords("赵", "赵啊", "赵啊三");
|
||||
|
||||
stopWatch.start("wordtree_cn_find_not_density");
|
||||
final List<String> result1 =
|
||||
wordTreeLocal.matchAllWords(input, -1, false, true).stream().map(FoundWord::getWord)
|
||||
.collect(Collectors.toList());
|
||||
stopWatch.stop();
|
||||
|
||||
Assert.assertEquals(1, result1.size());
|
||||
Assert.assertEquals("赵", String.join(",", result1));
|
||||
|
||||
System.out.println(stopWatch.prettyPrint());
|
||||
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user