This commit is contained in:
Looly 2023-03-15 09:12:56 +08:00
parent ee55467690
commit 50178ee1ab
3 changed files with 250 additions and 246 deletions

View File

@ -9,13 +9,13 @@ import java.util.*;
* *
* @author renyp * @author renyp
*/ */
public class Automaton { public class NFA {
private final Node root; private final Node root;
/** /**
* 默认构造 * 默认构造
*/ */
public Automaton() { public NFA() {
this.root = new Node(); this.root = new Node();
} }
@ -24,7 +24,7 @@ public class Automaton {
* *
* @param words 添加的新词 * @param words 添加的新词
*/ */
public Automaton(String... words) { public NFA(final String... words) {
this(); this();
this.insert(words); this.insert(words);
} }
@ -34,14 +34,13 @@ public class Automaton {
* *
* @param word 添加的新词 * @param word 添加的新词
*/ */
public void insert(String word) { public void insert(final String word) {
Node p = root; Node p = root;
for (char curr : word.toCharArray()) { for (final char curr : word.toCharArray()) {
int ind = curr; if (p.next.get((int) curr) == null) {
if (p.next.get(ind) == null) { p.next.put((int) curr, new Node());
p.next.put(ind, new Node());
} }
p = p.next.get(ind); p = p.next.get((int) curr);
} }
p.flag = true; p.flag = true;
p.str = word; p.str = word;
@ -52,8 +51,8 @@ public class Automaton {
* *
* @param words 添加的新词 * @param words 添加的新词
*/ */
public void insert(String... words) { public void insert(final String... words) {
for (String word : words) { for (final String word : words) {
this.insert(word); this.insert(word);
} }
} }
@ -62,15 +61,15 @@ public class Automaton {
* 构建基于NFA模型的 AC自动机 * 构建基于NFA模型的 AC自动机
*/ */
public void buildAc() { public void buildAc() {
Queue<Node> queue = new LinkedList<>(); final Queue<Node> queue = new LinkedList<>();
Node p = root; final Node p = root;
for (Integer key : p.next.keySet()) { for (final Integer key : p.next.keySet()) {
p.next.get(key).fail = root; p.next.get(key).fail = root;
queue.offer(p.next.get(key)); queue.offer(p.next.get(key));
} }
while (!queue.isEmpty()) { while (!queue.isEmpty()) {
Node curr = queue.poll(); final Node curr = queue.poll();
for (Integer key : curr.next.keySet()) { for (final Integer key : curr.next.keySet()) {
Node fail = curr.fail; Node fail = curr.fail;
// 查找当前节点匹配失败他对应等效匹配的节点是哪个 // 查找当前节点匹配失败他对应等效匹配的节点是哪个
while (fail != null && fail.next.get(key) == null) { while (fail != null && fail.next.get(key) == null) {
@ -90,20 +89,22 @@ public class Automaton {
/** /**
* @param text 查询的文本母串 * @param text 查询的文本母串
* @return 关键字列表
*/ */
public List<FoundWord> find(String text) { public List<FoundWord> find(final String text) {
return this.find(text, true); return this.find(text, true);
} }
/** /**
* @param text 查找的文本母串 * @param text 查找的文本母串
* @param isDensityMatch 是否密集匹配 * @param isDensityMatch 是否密集匹配
* @return 关键字列表
*/ */
public List<FoundWord> find(String text, boolean isDensityMatch) { public List<FoundWord> find(final String text, final boolean isDensityMatch) {
List<FoundWord> ans = new ArrayList<>(); final List<FoundWord> ans = new ArrayList<>();
Node p = root, k = null; Node p = root, k;
for (int i = 0, len = text.length(); i < len; i++) { for (int i = 0, len = text.length(); i < len; i++) {
int ind = text.charAt(i); final int ind = text.charAt(i);
// 状态转移(沿着fail指针链接的链表此处区别于DFA模型) // 状态转移(沿着fail指针链接的链表此处区别于DFA模型)
while (p != null && p.next.get(ind) == null) { while (p != null && p.next.get(ind) == null) {
p = p.fail; p = p.fail;

View File

@ -1,225 +0,0 @@
package cn.hutool.core.text.dfa;
import cn.hutool.core.date.StopWatch;
import junit.framework.TestCase;
import org.junit.Assert;
import org.junit.Test;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Tests for {@link Automaton}. Every case feeds the same keywords to an {@link Automaton} and to a
 * {@link WordTree}, asserts that both report the expected words, and prints the timings recorded
 * by a {@link StopWatch}.
 *
 * <p>NOTE(review): the class extends the JUnit 3 {@code TestCase} while some methods also carry the
 * JUnit 4 {@code @Test} annotation; presumably the annotations are ignored and the {@code test}
 * name prefix drives discovery - confirm before consolidating the two styles.
 */
public class AutomatonTest extends TestCase {

    /**
     * Dense matching on ASCII text: checks the words and index ranges found by the automaton and
     * compares the word list with WordTree.
     */
    public void testFind() {
        Automaton automaton = new Automaton();
        WordTree wordTree = new WordTree();
        automaton.insert("say", "her", "he", "she", "shr");
        automaton.buildAc();
        wordTree.addWords("say", "her", "he", "she", "shr");
        StopWatch stopWatch = new StopWatch();
        String input = "sasherhsay";

        stopWatch.start("automaton_char_find");
        List<FoundWord> ans1 = automaton.find(input);
        stopWatch.stop();
        assertEquals("she,he,her,say", joinWords(ans1));
        assertSpan(ans1.get(0), 2, 4);
        assertSpan(ans1.get(1), 3, 4);
        assertSpan(ans1.get(2), 3, 5);
        assertSpan(ans1.get(3), 7, 9);

        stopWatch.start("wordtree_char_find");
        List<String> ans2 = wordTree.matchAll(input, -1, true, true);
        stopWatch.stop();
        assertEquals("she,he,her,say", String.join(",", ans2));

        System.out.println(stopWatch.prettyPrint());
    }

    /**
     * Non-dense matching on ASCII text: checks the words and index ranges found by the automaton
     * and compares the word list with WordTree.
     */
    public void testFindNotDensity() {
        Automaton automaton = new Automaton();
        WordTree wordTree = new WordTree();
        automaton.insert("say", "her", "he", "she", "shr");
        automaton.buildAc();
        wordTree.addWords("say", "her", "he", "she", "shr");
        StopWatch stopWatch = new StopWatch();
        String input = "sasherhsay";

        stopWatch.start("automaton_char_find_not_density");
        List<FoundWord> ans1 = automaton.find(input, false);
        stopWatch.stop();
        assertEquals("she,say", joinWords(ans1));
        assertSpan(ans1.get(0), 2, 4);
        assertSpan(ans1.get(1), 7, 9);

        stopWatch.start("wordtree_char_find_not_density");
        List<String> ans2 = wordTree.matchAll(input, -1, false, true);
        stopWatch.stop();
        assertEquals("she,say", String.join(",", ans2));

        System.out.println(stopWatch.prettyPrint());
    }

    /**
     * Dense matching on ASCII text where the timed section covers construction as well as the
     * lookup, for both the automaton and WordTree.
     */
    public void testBuildAndFind() {
        StopWatch stopWatch = new StopWatch();
        String input = "sasherhsay";

        stopWatch.start("automaton_char_buid_find");
        Automaton automatonLocal = new Automaton();
        automatonLocal.insert("say", "her", "he", "she", "shr");
        automatonLocal.buildAc();
        List<FoundWord> ans1 = automatonLocal.find(input);
        stopWatch.stop();
        assertEquals("she,he,her,say", joinWords(ans1));
        assertSpan(ans1.get(0), 2, 4);
        assertSpan(ans1.get(1), 3, 4);
        assertSpan(ans1.get(2), 3, 5);
        assertSpan(ans1.get(3), 7, 9);

        stopWatch.start("wordtree_char_build_find");
        WordTree wordTreeLocal = new WordTree();
        wordTreeLocal.addWords("say", "her", "he", "she", "shr");
        List<String> ans2 = wordTreeLocal.matchAll(input, -1, true, true);
        stopWatch.stop();
        assertEquals("she,he,her,say", String.join(",", ans2));

        System.out.println(stopWatch.prettyPrint());
    }

    /**
     * Dense matching on Chinese text where the timed section covers construction as well as the
     * lookup, for both the automaton and WordTree.
     */
    @Test
    public void testBuildFindCnChar() {
        StopWatch stopWatch = new StopWatch();
        String input = "赵啊三在做什么";

        stopWatch.start("automaton_cn_build_find");
        Automaton automatonLocal = new Automaton();
        // The first keyword must be the single character the assertions below expect at index 0..0.
        automatonLocal.insert("赵", "赵啊", "赵啊三");
        automatonLocal.buildAc();
        final List<FoundWord> result = automatonLocal.find(input);
        stopWatch.stop();
        Assert.assertEquals(3, result.size());
        Assert.assertEquals("赵,赵啊,赵啊三", joinWords(result));
        assertSpan(result.get(0), 0, 0);
        assertSpan(result.get(1), 0, 1);
        assertSpan(result.get(2), 0, 2);

        stopWatch.start("wordtree_cn_build_find");
        WordTree wordTreeLocal = new WordTree();
        wordTreeLocal.addWords("赵", "赵啊", "赵啊三");
        final List<String> result1 = wordTreeLocal.matchAll(input, -1, true, true);
        stopWatch.stop();
        Assert.assertEquals(3, result1.size());
        Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1));

        System.out.println(stopWatch.prettyPrint());
    }

    /**
     * Dense matching on Chinese text where only the lookup is timed; the WordTree side goes
     * through {@code matchAllWords} and maps the hits back to their words.
     */
    @Test
    public void testFindCNChar() {
        StopWatch stopWatch = new StopWatch();
        String input = "赵啊三在做什么";
        Automaton automatonLocal = new Automaton();
        // The first keyword must be the single character the assertions below expect at index 0..0.
        automatonLocal.insert("赵", "赵啊", "赵啊三");
        automatonLocal.buildAc();

        stopWatch.start("automaton_cn_find");
        final List<FoundWord> result = automatonLocal.find(input);
        stopWatch.stop();
        Assert.assertEquals(3, result.size());
        Assert.assertEquals("赵,赵啊,赵啊三", joinWords(result));
        assertSpan(result.get(0), 0, 0);
        assertSpan(result.get(1), 0, 1);
        assertSpan(result.get(2), 0, 2);

        WordTree wordTreeLocal = new WordTree();
        wordTreeLocal.addWords("赵", "赵啊", "赵啊三");

        stopWatch.start("wordtree_cn_find");
        final List<String> result1 = wordTreeLocal.matchAllWords(input, -1, true, true).stream()
                .map(FoundWord::getWord)
                .collect(Collectors.toList());
        stopWatch.stop();
        Assert.assertEquals(3, result1.size());
        Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1));

        System.out.println(stopWatch.prettyPrint());
    }

    /**
     * Non-dense matching on Chinese text where only the lookup is timed; a single hit covering
     * index 0..0 is expected from both the automaton and WordTree.
     */
    @Test
    public void testFindCNCharNotDensity() {
        StopWatch stopWatch = new StopWatch();
        String input = "赵啊三在做什么";
        Automaton automatonLocal = new Automaton();
        // The first keyword must be the single character the assertions below expect at index 0..0.
        automatonLocal.insert("赵", "赵啊", "赵啊三");
        automatonLocal.buildAc();

        stopWatch.start("automaton_cn_find_not_density");
        final List<FoundWord> result = automatonLocal.find(input, false);
        stopWatch.stop();
        Assert.assertEquals(1, result.size());
        Assert.assertEquals("赵", joinWords(result));
        assertSpan(result.get(0), 0, 0);

        WordTree wordTreeLocal = new WordTree();
        wordTreeLocal.addWords("赵", "赵啊", "赵啊三");

        stopWatch.start("wordtree_cn_find_not_density");
        final List<String> result1 = wordTreeLocal.matchAllWords(input, -1, false, true).stream()
                .map(FoundWord::getWord)
                .collect(Collectors.toList());
        stopWatch.stop();
        Assert.assertEquals(1, result1.size());
        Assert.assertEquals("赵", String.join(",", result1));

        System.out.println(stopWatch.prettyPrint());
    }

    /**
     * Joins the words of the given hits with commas, in list order.
     *
     * @param words hits returned by a lookup
     * @return comma-separated words
     */
    private static String joinWords(List<FoundWord> words) {
        return words.stream().map(FoundWord::getWord).collect(Collectors.joining(","));
    }

    /**
     * Asserts the start and end index reported by a single hit.
     *
     * @param word  hit to inspect
     * @param start expected value of {@code getStartIndex()}
     * @param end   expected value of {@code getEndIndex()}
     */
    private static void assertSpan(FoundWord word, int start, int end) {
        assertEquals(Integer.valueOf(start), word.getStartIndex());
        assertEquals(Integer.valueOf(end), word.getEndIndex());
    }
}

View File

@ -0,0 +1,228 @@
package cn.hutool.core.text.dfa;
import cn.hutool.core.date.StopWatch;
import org.junit.Assert;
import org.junit.Test;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Tests for {@link NFA}. Every case feeds the same keywords to an {@link NFA} and to a
 * {@link WordTree} and asserts that both report the expected words; a {@link StopWatch} records
 * the timings, whose printing is currently commented out.
 */
public class NFATest {

    /**
     * Dense matching on ASCII text: checks the words and index ranges found by the NFA and
     * compares the word list with WordTree.
     */
    @Test
    public void testFind() {
        final NFA nfa = new NFA();
        nfa.insert("say", "her", "he", "she", "shr");
        nfa.buildAc();

        final WordTree wordTree = new WordTree();
        wordTree.addWords("say", "her", "he", "she", "shr");

        final StopWatch stopWatch = new StopWatch();
        final String input = "sasherhsay";

        stopWatch.start("automaton_char_find");
        final List<FoundWord> ans1 = nfa.find(input);
        stopWatch.stop();
        Assert.assertEquals("she,he,her,say", joinWords(ans1));
        assertSpan(ans1.get(0), 2, 4);
        assertSpan(ans1.get(1), 3, 4);
        assertSpan(ans1.get(2), 3, 5);
        assertSpan(ans1.get(3), 7, 9);

        stopWatch.start("wordtree_char_find");
        final List<String> ans2 = wordTree.matchAll(input, -1, true, true);
        stopWatch.stop();
        Assert.assertEquals("she,he,her,say", String.join(",", ans2));

        //Console.log(stopWatch.prettyPrint());
    }

    /**
     * Non-dense matching on ASCII text: checks the words and index ranges found by the NFA and
     * compares the word list with WordTree.
     */
    @Test
    public void testFindNotDensity() {
        final NFA nfa = new NFA();
        nfa.insert("say", "her", "he", "she", "shr");
        nfa.buildAc();

        final WordTree wordTree = new WordTree();
        wordTree.addWords("say", "her", "he", "she", "shr");

        final StopWatch stopWatch = new StopWatch();
        final String input = "sasherhsay";

        stopWatch.start("automaton_char_find_not_density");
        final List<FoundWord> ans1 = nfa.find(input, false);
        stopWatch.stop();
        Assert.assertEquals("she,say", joinWords(ans1));
        assertSpan(ans1.get(0), 2, 4);
        assertSpan(ans1.get(1), 7, 9);

        stopWatch.start("wordtree_char_find_not_density");
        final List<String> ans2 = wordTree.matchAll(input, -1, false, true);
        stopWatch.stop();
        Assert.assertEquals("she,say", String.join(",", ans2));

        //Console.log(stopWatch.prettyPrint());
    }

    /**
     * Dense matching on ASCII text where the timed section covers construction as well as the
     * lookup, for both the NFA and WordTree.
     */
    @Test
    public void testBuildAndFind() {
        final StopWatch stopWatch = new StopWatch();
        final String input = "sasherhsay";

        stopWatch.start("automaton_char_buid_find");
        final NFA nfaLocal = new NFA();
        nfaLocal.insert("say", "her", "he", "she", "shr");
        nfaLocal.buildAc();
        final List<FoundWord> ans1 = nfaLocal.find(input);
        stopWatch.stop();
        Assert.assertEquals("she,he,her,say", joinWords(ans1));
        assertSpan(ans1.get(0), 2, 4);
        assertSpan(ans1.get(1), 3, 4);
        assertSpan(ans1.get(2), 3, 5);
        assertSpan(ans1.get(3), 7, 9);

        stopWatch.start("wordtree_char_build_find");
        final WordTree wordTreeLocal = new WordTree();
        wordTreeLocal.addWords("say", "her", "he", "she", "shr");
        final List<String> ans2 = wordTreeLocal.matchAll(input, -1, true, true);
        stopWatch.stop();
        Assert.assertEquals("she,he,her,say", String.join(",", ans2));

        //Console.log(stopWatch.prettyPrint());
    }

    /**
     * Dense matching on Chinese text where the timed section covers construction as well as the
     * lookup, for both the NFA and WordTree.
     */
    @Test
    public void buildFindCnCharTest() {
        final StopWatch stopWatch = new StopWatch();
        final String input = "赵啊三在做什么";

        stopWatch.start("automaton_cn_build_find");
        final NFA nfaLocal = new NFA();
        // The first keyword must be the single character the assertions below expect at index 0..0.
        nfaLocal.insert("赵", "赵啊", "赵啊三");
        nfaLocal.buildAc();
        final List<FoundWord> result = nfaLocal.find(input);
        stopWatch.stop();
        Assert.assertEquals(3, result.size());
        Assert.assertEquals("赵,赵啊,赵啊三", joinWords(result));
        assertSpan(result.get(0), 0, 0);
        assertSpan(result.get(1), 0, 1);
        assertSpan(result.get(2), 0, 2);

        stopWatch.start("wordtree_cn_build_find");
        final WordTree wordTreeLocal = new WordTree();
        wordTreeLocal.addWords("赵", "赵啊", "赵啊三");
        final List<String> result1 = wordTreeLocal.matchAll(input, -1, true, true);
        stopWatch.stop();
        Assert.assertEquals(3, result1.size());
        Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1));

        //Console.log(stopWatch.prettyPrint());
    }

    /**
     * Dense matching on Chinese text where only the lookup is timed; the WordTree side goes
     * through {@code matchAllWords} and maps the hits back to their words.
     */
    @Test
    public void testFindCNChar() {
        final StopWatch stopWatch = new StopWatch();
        final String input = "赵啊三在做什么";
        final NFA nfaLocal = new NFA();
        // The first keyword must be the single character the assertions below expect at index 0..0.
        nfaLocal.insert("赵", "赵啊", "赵啊三");
        nfaLocal.buildAc();

        stopWatch.start("automaton_cn_find");
        final List<FoundWord> result = nfaLocal.find(input);
        stopWatch.stop();
        Assert.assertEquals(3, result.size());
        Assert.assertEquals("赵,赵啊,赵啊三", joinWords(result));
        assertSpan(result.get(0), 0, 0);
        assertSpan(result.get(1), 0, 1);
        assertSpan(result.get(2), 0, 2);

        final WordTree wordTreeLocal = new WordTree();
        wordTreeLocal.addWords("赵", "赵啊", "赵啊三");

        stopWatch.start("wordtree_cn_find");
        final List<String> result1 = wordTreeLocal.matchAllWords(input, -1, true, true).stream()
                .map(FoundWord::getWord)
                .collect(Collectors.toList());
        stopWatch.stop();
        Assert.assertEquals(3, result1.size());
        Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1));

        //Console.log(stopWatch.prettyPrint());
    }

    /**
     * Non-dense matching on Chinese text where only the lookup is timed; a single hit covering
     * index 0..0 is expected from both the NFA and WordTree.
     */
    @Test
    public void testFindCNCharNotDensity() {
        final StopWatch stopWatch = new StopWatch();
        final String input = "赵啊三在做什么";
        final NFA nfaLocal = new NFA();
        // The first keyword must be the single character the assertions below expect at index 0..0.
        nfaLocal.insert("赵", "赵啊", "赵啊三");
        nfaLocal.buildAc();

        stopWatch.start("automaton_cn_find_not_density");
        final List<FoundWord> result = nfaLocal.find(input, false);
        stopWatch.stop();
        Assert.assertEquals(1, result.size());
        Assert.assertEquals("赵", joinWords(result));
        assertSpan(result.get(0), 0, 0);

        final WordTree wordTreeLocal = new WordTree();
        wordTreeLocal.addWords("赵", "赵啊", "赵啊三");

        stopWatch.start("wordtree_cn_find_not_density");
        final List<String> result1 = wordTreeLocal.matchAllWords(input, -1, false, true).stream()
                .map(FoundWord::getWord)
                .collect(Collectors.toList());
        stopWatch.stop();
        Assert.assertEquals(1, result1.size());
        Assert.assertEquals("赵", String.join(",", result1));

        //Console.log(stopWatch.prettyPrint());
    }

    /**
     * Joins the words of the given hits with commas, in list order.
     *
     * @param words hits returned by a lookup
     * @return comma-separated words
     */
    private static String joinWords(final List<FoundWord> words) {
        return words.stream().map(FoundWord::getWord).collect(Collectors.joining(","));
    }

    /**
     * Asserts the begin and end index reported by a single hit.
     *
     * @param word  hit to inspect
     * @param begin expected value of {@code getBeginIndex()}
     * @param end   expected value of {@code getEndIndex()}
     */
    private static void assertSpan(final FoundWord word, final int begin, final int end) {
        Assert.assertEquals(begin, word.getBeginIndex().intValue());
        Assert.assertEquals(end, word.getEndIndex().intValue());
    }
}