diff --git a/hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java b/hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java
new file mode 100644
index 000000000..c34f8b676
--- /dev/null
+++ b/hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java
@@ -0,0 +1,164 @@
+package cn.hutool.core.text.dfa;
+
+import java.util.*;
+
+/**
+ * Multi-pattern matching tool based on an Aho-Corasick automaton that follows failure links at
+ * match time (the NFA model) instead of pre-computing every transition.
+ *
+ * <p>Usage: add words with {@link #insert(String...)}, call {@link #buildAc()}, then search with
+ * {@link #find(String)}. Nodes created by an insert made after {@link #buildAc()} have no failure
+ * link until {@link #buildAc()} is called again.
+ *
+ * @author renyp
+ */
+public class Automaton {
+    private final Node root;
+
+    /**
+     * Creates an automaton with an empty word library.
+     */
+    public Automaton() {
+        this.root = new Node();
+    }
+
+    /**
+     * Creates an automaton and adds the given words to its word library. The failure links are
+     * not built here; call {@link #buildAc()} afterwards.
+     *
+     * @param words words to add
+     */
+    public Automaton(String... words) {
+        this();
+        this.insert(words);
+    }
+
+    /**
+     * Adds one word to the word library by extending the lookup trie.
+     *
+     * <p>{@code null} and empty words are ignored. An empty word would mark the root as the end of
+     * a word, which can make {@link #find(String, boolean)} report a zero-length hit at every
+     * position.
+     *
+     * @param word word to add
+     */
+    public void insert(String word) {
+        if (word == null || word.isEmpty()) {
+            return;
+        }
+        Node p = root;
+        for (int i = 0, len = word.length(); i < len; i++) {
+            p = p.next.computeIfAbsent((int) word.charAt(i), key -> new Node());
+        }
+        p.flag = true;
+        p.str = word;
+    }
+
+    /**
+     * Adds several words to the word library by extending the lookup trie.
+     *
+     * @param words words to add; a {@code null} array is ignored
+     */
+    public void insert(String... words) {
+        if (words == null) {
+            return;
+        }
+        for (String word : words) {
+            this.insert(word);
+        }
+    }
+
+    /**
+     * Builds the failure links of the Aho-Corasick automaton (NFA model), level by level.
+     */
+    public void buildAc() {
+        Queue<Node> queue = new ArrayDeque<>();
+        // Children of the root always fall back to the root.
+        for (Node child : root.next.values()) {
+            child.fail = root;
+            queue.offer(child);
+        }
+        while (!queue.isEmpty()) {
+            Node curr = queue.poll();
+            for (Map.Entry<Integer, Node> entry : curr.next.entrySet()) {
+                Integer key = entry.getKey();
+                Node child = entry.getValue();
+                // Walk the parent's failure chain to the first node that has a transition on key.
+                Node fail = curr.fail;
+                while (fail != null && fail.next.get(key) == null) {
+                    fail = fail.fail;
+                }
+                // No such node: a failed match at child restarts from the root.
+                child.fail = (fail != null) ? fail.next.get(key) : root;
+                queue.offer(child);
+            }
+        }
+    }
+
+    /**
+     * Finds occurrences of library words in the text using density matching.
+     *
+     * @param text text to search (the haystack); {@code null} yields an empty list
+     * @return the matches in the order they are found, never {@code null}
+     */
+    public List<FoundWord> find(String text) {
+        return this.find(text, true);
+    }
+
+    /**
+     * Finds occurrences of library words in the text.
+     *
+     * @param text           text to search (the haystack); {@code null} yields an empty list
+     * @param isDensityMatch {@code true} to report every word ending at each position;
+     *                       {@code false} to report only the first word met on the failure chain
+     *                       and then restart matching from the root
+     * @return the matches in the order they are found, never {@code null}
+     */
+    public List<FoundWord> find(String text, boolean isDensityMatch) {
+        List<FoundWord> ans = new ArrayList<>();
+        if (text == null) {
+            return ans;
+        }
+        Node p = root;
+        for (int i = 0, len = text.length(); i < len; i++) {
+            int ind = text.charAt(i);
+            // State transition: follow the failure chain until a node accepts this character
+            // (walking the chain at match time is what differs from the DFA model).
+            while (p != null && p.next.get(ind) == null) {
+                p = p.fail;
+            }
+            p = (p == null) ? root : p.next.get(ind);
+            // Collect results: each word-ending node on the failure chain is a hit ending at i.
+            for (Node k = p; k != null; k = k.fail) {
+                if (k.flag) {
+                    ans.add(new FoundWord(k.str, k.str, i - k.str.length() + 1, i));
+                    if (!isDensityMatch) {
+                        p = root;
+                        break;
+                    }
+                }
+            }
+        }
+        return ans;
+    }
+
+    /**
+     * Trie node of the automaton.
+     */
+    private static class Node {
+
+        /** Whether a library word ends at this node. */
+        boolean flag;
+        /** Failure link assigned by {@code buildAc()}; stays {@code null} on the root. */
+        Node fail;
+        /** The word that ends at this node; only set together with {@code flag}. */
+        String str;
+        /** Child nodes keyed by character code. */
+        final Map<Integer, Node> next;
+
+        Node() {
+            this.flag = false;
+            this.next = new HashMap<>();
+        }
+    }
+}
diff --git a/hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java b/hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java
new file mode 100644
index 000000000..9acdf13b9
--- /dev/null
+++ b/hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java
@@ -0,0 +1,239 @@
+package cn.hutool.core.text.dfa;
+
+import cn.hutool.core.date.StopWatch;
+import junit.framework.TestCase;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class AutomatonTest extends TestCase {
+
+    /**
+     * Density matching: checks the search result and compares timing with WordTree.
+     */
+    public void testFind() {
+        Automaton automaton = new Automaton();
+        WordTree wordTree = new WordTree();
+        automaton.insert("say", "her", "he", "she", "shr");
+        automaton.buildAc();
+        wordTree.addWords("say", "her", "he", "she", "shr");
+
+        StopWatch stopWatch = new StopWatch();
+        String input = "sasherhsay";
+
+        stopWatch.start("automaton_char_find");
+        List<FoundWord> ans1 = automaton.find(input);
+        stopWatch.stop();
+        assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
+        assertEquals(Integer.valueOf(2), ans1.get(0).getStartIndex());
+        assertEquals(Integer.valueOf(4), ans1.get(0).getEndIndex());
+        assertEquals(Integer.valueOf(3), ans1.get(1).getStartIndex());
+        assertEquals(Integer.valueOf(4), ans1.get(1).getEndIndex());
+        assertEquals(Integer.valueOf(3), ans1.get(2).getStartIndex());
+        assertEquals(Integer.valueOf(5), ans1.get(2).getEndIndex());
+        assertEquals(Integer.valueOf(7), ans1.get(3).getStartIndex());
+        assertEquals(Integer.valueOf(9), ans1.get(3).getEndIndex());
+
+        stopWatch.start("wordtree_char_find");
+        List<String> ans2 = wordTree.matchAll(input, -1, true, true);
+        stopWatch.stop();
+        assertEquals("she,he,her,say", String.join(",", ans2));
+
+        System.out.println(stopWatch.prettyPrint());
+    }
+
+    /**
+     * Non-density matching: checks the search result and compares timing with WordTree.
+     */
+    public void testFindNotDensity() {
+        Automaton automaton = new Automaton();
+        WordTree wordTree = new WordTree();
+        automaton.insert("say", "her", "he", "she", "shr");
+        automaton.buildAc();
+        wordTree.addWords("say", "her", "he", "she", "shr");
+
+        StopWatch stopWatch = new StopWatch();
+        String input = "sasherhsay";
+
+        stopWatch.start("automaton_char_find_not_density");
+        List<FoundWord> ans1 = automaton.find(input, false);
+        stopWatch.stop();
+        assertEquals("she,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
+        assertEquals(Integer.valueOf(2), ans1.get(0).getStartIndex());
+        assertEquals(Integer.valueOf(4), ans1.get(0).getEndIndex());
+        assertEquals(Integer.valueOf(7), ans1.get(1).getStartIndex());
+        assertEquals(Integer.valueOf(9), ans1.get(1).getEndIndex());
+
+        stopWatch.start("wordtree_char_find_not_density");
+        List<String> ans2 = wordTree.matchAll(input, -1, false, true);
+        stopWatch.stop();
+        assertEquals("she,say", String.join(",", ans2));
+
+        System.out.println(stopWatch.prettyPrint());
+    }
+
+    /**
+     * Density matching: checks building plus searching and compares timing with WordTree.
+     */
+    public void testBuildAndFind() {
+        StopWatch stopWatch = new StopWatch();
+        String input = "sasherhsay";
+
+        stopWatch.start("automaton_char_buid_find");
+        Automaton automatonLocal = new Automaton();
+        automatonLocal.insert("say", "her", "he", "she", "shr");
+        automatonLocal.buildAc();
+        List<FoundWord> ans1 = automatonLocal.find(input);
+        stopWatch.stop();
+        assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
+        assertEquals(Integer.valueOf(2), ans1.get(0).getStartIndex());
+        assertEquals(Integer.valueOf(4), ans1.get(0).getEndIndex());
+        assertEquals(Integer.valueOf(3), ans1.get(1).getStartIndex());
+        assertEquals(Integer.valueOf(4), ans1.get(1).getEndIndex());
+        assertEquals(Integer.valueOf(3), ans1.get(2).getStartIndex());
+        assertEquals(Integer.valueOf(5), ans1.get(2).getEndIndex());
+        assertEquals(Integer.valueOf(7), ans1.get(3).getStartIndex());
+        assertEquals(Integer.valueOf(9), ans1.get(3).getEndIndex());
+
+        stopWatch.start("wordtree_char_build_find");
+        WordTree wordTreeLocal = new WordTree();
+        wordTreeLocal.addWords("say", "her", "he", "she", "shr");
+        List<String> ans2 = wordTreeLocal.matchAll(input, -1, true, true);
+        stopWatch.stop();
+        assertEquals("she,he,her,say", String.join(",", ans2));
+
+        System.out.println(stopWatch.prettyPrint());
+    }
+
+    /**
+     * Density matching: checks building and searching with Chinese characters, timed against WordTree.
+     */
+    @Test
+    public void testBuildFindCnChar() {
+        StopWatch stopWatch = new StopWatch();
+        String input = "赵啊三在做什么";
+
+        stopWatch.start("automaton_cn_build_find");
+        Automaton automatonLocal = new Automaton();
+        automatonLocal.insert("赵", "赵啊", "赵啊三");
+        automatonLocal.buildAc();
+
+        final List<FoundWord> result = automatonLocal.find(input);
+        stopWatch.stop();
+
+        Assert.assertEquals(3, result.size());
+        Assert.assertEquals("赵,赵啊,赵啊三", result.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
+        assertEquals(Integer.valueOf(0), result.get(0).getStartIndex());
+        assertEquals(Integer.valueOf(0), result.get(0).getEndIndex());
+        assertEquals(Integer.valueOf(0), result.get(1).getStartIndex());
+        assertEquals(Integer.valueOf(1), result.get(1).getEndIndex());
+        assertEquals(Integer.valueOf(0), result.get(2).getStartIndex());
+        assertEquals(Integer.valueOf(2), result.get(2).getEndIndex());
+
+        stopWatch.start("wordtree_cn_build_find");
+        WordTree wordTreeLocal = new WordTree();
+        wordTreeLocal.addWords("赵", "赵啊", "赵啊三");
+
+        final List<String> result1 = wordTreeLocal.matchAll(input, -1, true, true);
+        stopWatch.stop();
+
+        Assert.assertEquals(3, result1.size());
+        Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1));
+
+        System.out.println(stopWatch.prettyPrint());
+
+    }
+
+    /**
+     * Density matching: checks searching with Chinese characters, timed against WordTree.
+     */
+    @Test
+    public void testFindCNChar() {
+        StopWatch stopWatch = new StopWatch();
+        String input = "赵啊三在做什么";
+
+        Automaton automatonLocal = new Automaton();
+        automatonLocal.insert("赵", "赵啊", "赵啊三");
+        automatonLocal.buildAc();
+
+        stopWatch.start("automaton_cn_find");
+        final List<FoundWord> result = automatonLocal.find(input);
+        stopWatch.stop();
+
+        Assert.assertEquals(3, result.size());
+        Assert.assertEquals("赵,赵啊,赵啊三", result.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
+        assertEquals(Integer.valueOf(0), result.get(0).getStartIndex());
+        assertEquals(Integer.valueOf(0), result.get(0).getEndIndex());
+        assertEquals(Integer.valueOf(0), result.get(1).getStartIndex());
+        assertEquals(Integer.valueOf(1), result.get(1).getEndIndex());
+        assertEquals(Integer.valueOf(0), result.get(2).getStartIndex());
+        assertEquals(Integer.valueOf(2), result.get(2).getEndIndex());
+
+        WordTree wordTreeLocal = new WordTree();
+        wordTreeLocal.addWords("赵", "赵啊", "赵啊三");
+
+        stopWatch.start("wordtree_cn_find");
+        final List<String> result1 = wordTreeLocal.matchAllWords(input, -1, true, true).stream().map(FoundWord::getWord)
+                .collect(Collectors.toList());
+        stopWatch.stop();
+
+        Assert.assertEquals(3, result1.size());
+        Assert.assertEquals("赵,赵啊,赵啊三", String.join(",", result1));
+
+        System.out.println(stopWatch.prettyPrint());
+
+    }
+
+    /**
+     * Non-density matching: checks searching with Chinese characters, timed against WordTree.
+     */
+    @Test
+    public void testFindCNCharNotDensity() {
+        StopWatch stopWatch = new StopWatch();
+        String input = "赵啊三在做什么";
+
+        Automaton automatonLocal = new Automaton();
+        automatonLocal.insert("赵", "赵啊", "赵啊三");
+        automatonLocal.buildAc();
+
+        stopWatch.start("automaton_cn_find_not_density");
+        final List<FoundWord> result = automatonLocal.find(input, false);
+        stopWatch.stop();
+
+        Assert.assertEquals(1, result.size());
+        Assert.assertEquals("赵", result.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
+        assertEquals(Integer.valueOf(0), result.get(0).getStartIndex());
+        assertEquals(Integer.valueOf(0), result.get(0).getEndIndex());
+
+        WordTree wordTreeLocal = new WordTree();
+        wordTreeLocal.addWords("赵", "赵啊", "赵啊三");
+
+        stopWatch.start("wordtree_cn_find_not_density");
+        final List<String> result1 =
+                wordTreeLocal.matchAllWords(input, -1, false, true).stream().map(FoundWord::getWord)
+                        .collect(Collectors.toList());
+        stopWatch.stop();
+
+        Assert.assertEquals(1, result1.size());
+        Assert.assertEquals("赵", String.join(",", result1));
+
+        System.out.println(stopWatch.prettyPrint());
+
+    }
+
+    /**
+     * Null and empty words are ignored and a null text yields no matches.
+     */
+    public void testIgnoresEmptyWordAndNullText() {
+        Automaton automaton = new Automaton();
+        automaton.insert("", "he");
+        automaton.insert((String) null);
+        automaton.buildAc();
+
+        List<FoundWord> result = automaton.find("she");
+        assertEquals("he", result.stream().map(FoundWord::getWord).collect(Collectors.joining(",")));
+        assertTrue(automaton.find(null).isEmpty());
+    }
+}