From 00e9af4ffb6fe1b354f2c492823845180380257e Mon Sep 17 00:00:00 2001 From: renyp Date: Thu, 16 Mar 2023 20:40:36 +0800 Subject: [PATCH] =?UTF-8?q?fix=EF=BC=9A=E9=81=BF=E5=85=8D=E8=B0=83?= =?UTF-8?q?=E7=94=A8=E6=96=B9=20=E6=98=BE=E7=A4=BA=E8=B0=83=E7=94=A8API=20?= =?UTF-8?q?=E8=A7=A6=E5=8F=91=E6=9F=A5=E6=89=BE=E6=A0=91=20=E4=BC=98?= =?UTF-8?q?=E5=8C=96=EF=BC=9B=E5=B9=B6=E9=80=9A=E8=BF=87=E5=86=85=E7=BD=AE?= =?UTF-8?q?=E9=94=81=EF=BC=8C=E9=81=BF=E5=85=8D=E5=9B=A0=E5=B9=B6=E8=A1=8C?= =?UTF-8?q?=E6=A0=91=E4=BC=98=E5=8C=96=20=E5=8F=AF=E8=83=BD=E9=80=A0?= =?UTF-8?q?=E6=88=90=E7=9A=84=E4=B8=8D=E5=8F=AF=E9=A2=84=E7=9F=A5=E7=BB=93?= =?UTF-8?q?=E6=9E=9C=20=E5=92=8C=20=E6=97=A0=E6=95=88=E9=87=8D=E5=A4=8D?= =?UTF-8?q?=E7=9A=84=20=E6=A0=91=E4=BC=98=E5=8C=96=E6=93=8D=E4=BD=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cn/hutool/core/text/dfa/Automaton.java | 280 ++++++++++-------- .../hutool/core/text/dfa/AutomatonTest.java | 12 +- 2 files changed, 164 insertions(+), 128 deletions(-) diff --git a/hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java b/hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java index c34f8b676..ce5c0b134 100644 --- a/hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java +++ b/hutool-core/src/main/java/cn/hutool/core/text/dfa/Automaton.java @@ -4,141 +4,177 @@ import java.util.*; /** *

- * * 基于非确定性有穷自动机(NFA) 实现的多模匹配工具 + *

* * @author renyp */ public class Automaton { - private final Node root; + /** + * AC树的根节点 + */ + private final Node root; + /** + * 标记是否需要构建AC自动机,做树优化 + */ + private volatile boolean needBuildAC; - /** - * 默认构造 - */ - public Automaton() { - this.root = new Node(); - } + /** + * 内置锁,防止并发场景,并行建AC树,造成不可预知结果 + */ + private final Object lock; - /** - * 构造函数 并 初始化词库 - * - * @param words 添加的新词 - */ - public Automaton(String... words) { - this(); - this.insert(words); - } + /** + * 默认构造 + */ + public Automaton() { + this.root = new Node(); + this.needBuildAC = true; + this.lock = new Object(); + } - /** - * 词库添加新词,初始化查找树 - * - * @param word 添加的新词 - */ - public void insert(String word) { - Node p = root; - for (char curr : word.toCharArray()) { - int ind = curr; - if (p.next.get(ind) == null) { - p.next.put(ind, new Node()); - } - p = p.next.get(ind); - } - p.flag = true; - p.str = word; - } + /** + * 构造函数 并 初始化词库 + * + * @param words 添加的新词 + */ + public Automaton(String... words) { + this(); + this.insert(words); + } - /** - * 词库批量添加新词,初始化查找树 - * - * @param words 添加的新词 - */ - public void insert(String... words) { - for (String word : words) { - this.insert(word); - } - } + /** + * 词库添加新词,初始化查找树 + * + * @param word 添加的新词 + */ + public void insert(String word) { + needBuildAC = true; + Node p = root; + for (char curr : word.toCharArray()) { + int ind = curr; + if (p.next.get(ind) == null) { + p.next.put(ind, new Node()); + } + p = p.next.get(ind); + } + p.flag = true; + p.str = word; + } - /** - * 构建基于NFA模型的 AC自动机 - */ - public void buildAc() { - Queue queue = new LinkedList<>(); - Node p = root; - for (Integer key : p.next.keySet()) { - p.next.get(key).fail = root; - queue.offer(p.next.get(key)); - } - while (!queue.isEmpty()) { - Node curr = queue.poll(); - for (Integer key : curr.next.keySet()) { - Node fail = curr.fail; - // 查找当前节点匹配失败,他对应等效匹配的节点是哪个 - while (fail != null && fail.next.get(key) == null) { - fail = fail.fail; - } - // 代码到这,有两种可能,fail不为null,说明找到了fail;fail为null,没有找到,那么就把fail指向root节点(当到该节点匹配失败,那么从root节点开始重新匹配) - if (fail != null) { - fail = fail.next.get(key); - } else { - fail = root; - } - curr.next.get(key).fail = fail; - queue.offer(curr.next.get(key)); - } - } - } + /** + * 词库批量添加新词,初始化查找树 + * + * @param words 添加的新词 + */ + public void insert(String... words) { + for (String word : words) { + this.insert(word); + } + } - /** - * @param text 查询的文本(母串) - */ - public List find(String text) { - return this.find(text, true); - } + /** + * 构建基于NFA模型的 AC自动机 + */ + private void buildAc() { + Queue queue = new LinkedList<>(); + Node p = root; + for (Integer key : p.next.keySet()) { + p.next.get(key).fail = root; + queue.offer(p.next.get(key)); + } + while (!queue.isEmpty()) { + Node curr = queue.poll(); + for (Integer key : curr.next.keySet()) { + Node fail = curr.fail; + // 查找当前节点匹配失败,他对应等效匹配的节点是哪个 + while (fail != null && fail.next.get(key) == null) { + fail = fail.fail; + } + // 代码到这,有两种可能,fail不为null,说明找到了fail;fail为null,没有找到,那么就把fail指向root节点(当到该节点匹配失败,那么从root节点开始重新匹配) + if (fail != null) { + fail = fail.next.get(key); + } else { + fail = root; + } + curr.next.get(key).fail = fail; + queue.offer(curr.next.get(key)); + } + } + needBuildAC = false; + } - /** - * @param text 查找的文本(母串) - * @param isDensityMatch 是否密集匹配 - */ - public List find(String text, boolean isDensityMatch) { - List ans = new ArrayList<>(); - Node p = root, k = null; - for (int i = 0, len = text.length(); i < len; i++) { - int ind = text.charAt(i); - // 状态转移(沿着fail指针链接的链表,此处区别于DFA模型) - while (p != null && p.next.get(ind) == null) { - p = p.fail; - } - if (p == null) { - p = root; - } else { - p = p.next.get(ind); - } - // 提取结果(沿着fail指针链接的链表,此处区别于DFA模型) - k = p; - while (k != null) { - if (k.flag) { - ans.add(new FoundWord(k.str, k.str, i - k.str.length() + 1, i)); - if (!isDensityMatch) { - p = root; - break; - } - } - k = k.fail; - } - } - return ans; - } + /** + * @param text 查询的文本(母串) + */ + public List find(String text) { + return this.find(text, true); + } - private static class Node { + /** + * @param text 查找的文本(母串) + * @param isDensityMatch 是否密集匹配 + */ + public List find(String text, boolean isDensityMatch) { + // double check,防止重复无用的 buildAC + if (needBuildAC) { + synchronized (lock) { + if (needBuildAC) { + this.buildAc(); + } + } + } + List ans = new ArrayList<>(); + Node p = root, k = null; + for (int i = 0, len = text.length(); i < len; i++) { + int ind = text.charAt(i); + // 状态转移(沿着fail指针链接的链表,此处区别于DFA模型) + while (p != null && p.next.get(ind) == null) { + p = p.fail; + } + if (p == null) { + p = root; + } else { + p = p.next.get(ind); + } + // 提取结果(沿着fail指针链接的链表,此处区别于DFA模型) + k = p; + while (k != null) { + if (k.flag) { + ans.add(new FoundWord(k.str, k.str, i - k.str.length() + 1, i)); + if (!isDensityMatch) { + p = root; + break; + } + } + k = k.fail; + } + } + return ans; + } - boolean flag; - Node fail; - String str; - Map next; + private static class Node { - public Node() { - this.flag = false; - next = new HashMap<>(); - } - } + /** + * 当前节点是否是一个单词的结尾 + */ + boolean flag; + /** + * 指向 当前节点匹配失败应该跳转的下个节点 + */ + Node fail; + /** + * 以当前节点结尾的单词 + */ + String str; + /** + * 当前节点的子节点 + */ + Map next; + + public Node() { + this.flag = false; + next = new HashMap<>(); + } + } } diff --git a/hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java b/hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java index 9acdf13b9..763a6d881 100644 --- a/hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java +++ b/hutool-core/src/test/java/cn/hutool/core/text/dfa/AutomatonTest.java @@ -17,7 +17,7 @@ public class AutomatonTest extends TestCase { Automaton automaton = new Automaton(); WordTree wordTree = new WordTree(); automaton.insert("say", "her", "he", "she", "shr"); - automaton.buildAc(); +// automaton.buildAc(); wordTree.addWords("say", "her", "he", "she", "shr"); StopWatch stopWatch = new StopWatch(); @@ -51,7 +51,7 @@ public class AutomatonTest extends TestCase { Automaton automaton = new Automaton(); WordTree wordTree = new WordTree(); automaton.insert("say", "her", "he", "she", "shr"); - automaton.buildAc(); +// automaton.buildAc(); wordTree.addWords("say", "her", "he", "she", "shr"); StopWatch stopWatch = new StopWatch(); @@ -84,7 +84,7 @@ public class AutomatonTest extends TestCase { stopWatch.start("automaton_char_buid_find"); Automaton automatonLocal = new Automaton(); automatonLocal.insert("say", "her", "he", "she", "shr"); - automatonLocal.buildAc(); +// automatonLocal.buildAc(); List ans1 = automatonLocal.find(input); stopWatch.stop(); assertEquals("she,he,her,say", ans1.stream().map(FoundWord::getWord).collect(Collectors.joining(","))); @@ -118,7 +118,7 @@ public class AutomatonTest extends TestCase { stopWatch.start("automaton_cn_build_find"); Automaton automatonLocal = new Automaton(); automatonLocal.insert("赵", "赵啊", "赵啊三"); - automatonLocal.buildAc(); +// automatonLocal.buildAc(); final List result = automatonLocal.find(input); stopWatch.stop(); @@ -156,7 +156,7 @@ public class AutomatonTest extends TestCase { Automaton automatonLocal = new Automaton(); automatonLocal.insert("赵", "赵啊", "赵啊三"); - automatonLocal.buildAc(); +// automatonLocal.buildAc(); stopWatch.start("automaton_cn_find"); final List result = automatonLocal.find(input); @@ -196,7 +196,7 @@ public class AutomatonTest extends TestCase { Automaton automatonLocal = new Automaton(); automatonLocal.insert("赵", "赵啊", "赵啊三"); - automatonLocal.buildAc(); +// automatonLocal.buildAc(); stopWatch.start("automaton_cn_find_not_density"); final List result = automatonLocal.find(input, false);