Merge pull request #2992 from veteran2018/v6-dev

fix: (基于NFA模型实现的AC自动机)优化调用方需要显式触发API完成树优化的问题
This commit is contained in:
Golden Looly 2023-03-17 10:22:13 +08:00 committed by GitHub
commit f50cb5e8df
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 173 additions and 129 deletions

View File

@ -4,19 +4,39 @@ import java.util.*;
/** /**
* <p> * <p>
*
* 基于非确定性有穷自动机NFA 实现的多模匹配工具 * 基于非确定性有穷自动机NFA 实现的多模匹配工具
* </p>
* *
* @author renyp * @author renyp
*/ */
public class NFA { public class NFA {
/**
* Root node of the AC tree.
*/
private final Node root; private final Node root;
/**
* Flags whether the AC automaton still has to be built (tree optimization) before searching.
*/
private volatile boolean needBuildAc;
/**
* Built-in lock that keeps concurrent callers from building the AC tree in parallel, which would give unpredictable results.
*/
private final Object buildAcLock;
/**
* Built-in lock that keeps a parallel insert from tampering with a newly created node before it is attached to the tree.
*/
private final Object insertTreeLock;
/**
 * Default constructor: creates an empty tree consisting of a fresh root node,
 * allocates the two lock objects, and flags the automaton as not yet built.
 */
public NFA() {
	root = new Node();
	insertTreeLock = new Object();
	buildAcLock = new Object();
	needBuildAc = true;
}
/** /**
@ -24,7 +44,7 @@ public class NFA {
* *
* @param words 添加的新词 * @param words 添加的新词
*/ */
public NFA(String... words) {
	// Start from the empty tree set up by the default constructor ...
	this();
	// ... then add every supplied word to it.
	insert(words);
}
@ -34,25 +54,29 @@ public class NFA {
* *
* @param word 添加的新词 * @param word 添加的新词
*/ */
public void insert(final String word) { public void insert(String word) {
synchronized (insertTreeLock) {
needBuildAc = true;
Node p = root; Node p = root;
for (final char curr : word.toCharArray()) { for (char curr : word.toCharArray()) {
if (p.next.get((int) curr) == null) { int ind = curr;
p.next.put((int) curr, new Node()); if (p.next.get(ind) == null) {
p.next.put(ind, new Node());
} }
p = p.next.get((int) curr); p = p.next.get(ind);
} }
p.flag = true; p.flag = true;
p.str = word; p.str = word;
} }
}
/** /**
* 词库批量添加新词初始化查找树 * 词库批量添加新词初始化查找树
* *
* @param words 添加的新词 * @param words 添加的新词
*/ */
public void insert(final String... words) { public void insert(String... words) {
for (final String word : words) { for (String word : words) {
this.insert(word); this.insert(word);
} }
} }
@ -60,16 +84,16 @@ public class NFA {
/** /**
* 构建基于NFA模型的 AC自动机 * 构建基于NFA模型的 AC自动机
*/ */
public void buildAc() { private void buildAc() {
final Queue<Node> queue = new LinkedList<>(); Queue<Node> queue = new LinkedList<>();
final Node p = root; Node p = root;
for (final Integer key : p.next.keySet()) { for (Integer key : p.next.keySet()) {
p.next.get(key).fail = root; p.next.get(key).fail = root;
queue.offer(p.next.get(key)); queue.offer(p.next.get(key));
} }
while (!queue.isEmpty()) { while (!queue.isEmpty()) {
final Node curr = queue.poll(); Node curr = queue.poll();
for (final Integer key : curr.next.keySet()) { for (Integer key : curr.next.keySet()) {
Node fail = curr.fail; Node fail = curr.fail;
// 查找当前节点匹配失败他对应等效匹配的节点是哪个 // 查找当前节点匹配失败他对应等效匹配的节点是哪个
while (fail != null && fail.next.get(key) == null) { while (fail != null && fail.next.get(key) == null) {
@ -85,26 +109,33 @@ public class NFA {
queue.offer(curr.next.get(key)); queue.offer(curr.next.get(key));
} }
} }
needBuildAc = false;
} }
/** /**
* @param text 查询的文本母串 * @param text 查询的文本母串
* @return 关键字列表
*/ */
public List<FoundWord> find(final String text) { public List<FoundWord> find(String text) {
return this.find(text, true); return this.find(text, true);
} }
/** /**
* @param text 查找的文本母串 * @param text 查找的文本母串
* @param isDensityMatch 是否密集匹配 * @param isDensityMatch 是否密集匹配
* @return 关键字列表
*/ */
public List<FoundWord> find(final String text, final boolean isDensityMatch) { public List<FoundWord> find(String text, boolean isDensityMatch) {
final List<FoundWord> ans = new ArrayList<>(); // double check防止重复无用的 buildAC
Node p = root, k; if (needBuildAc) {
synchronized (buildAcLock) {
if (needBuildAc) {
this.buildAc();
}
}
}
List<FoundWord> ans = new ArrayList<>();
Node p = root, k = null;
for (int i = 0, len = text.length(); i < len; i++) { for (int i = 0, len = text.length(); i < len; i++) {
final int ind = text.charAt(i); int ind = text.charAt(i);
// 状态转移(沿着fail指针链接的链表此处区别于DFA模型) // 状态转移(沿着fail指针链接的链表此处区别于DFA模型)
while (p != null && p.next.get(ind) == null) { while (p != null && p.next.get(ind) == null) {
p = p.fail; p = p.fail;
@ -130,11 +161,24 @@ public class NFA {
return ans; return ans;
} }
private static class Node { private static class Node {
/**
* 当前节点是否是一个单词的结尾
*/
boolean flag; boolean flag;
/**
* 指向 当前节点匹配失败应该跳转的下个节点
*/
Node fail; Node fail;
/**
* 以当前节点结尾的单词
*/
String str; String str;
/**
* 当前节点的子节点
*/
Map<Integer, Node> next; Map<Integer, Node> next;
public Node() { public Node() {

View File

@ -16,7 +16,7 @@ public class NFATest {
public void testFind() { public void testFind() {
final NFA NFA = new NFA(); final NFA NFA = new NFA();
NFA.insert("say", "her", "he", "she", "shr"); NFA.insert("say", "her", "he", "she", "shr");
NFA.buildAc(); // NFA.buildAc();
final WordTree wordTree = new WordTree(); final WordTree wordTree = new WordTree();
wordTree.addWords("say", "her", "he", "she", "shr"); wordTree.addWords("say", "her", "he", "she", "shr");
@ -53,7 +53,7 @@ public class NFATest {
public void testFindNotDensity() { public void testFindNotDensity() {
final NFA NFA = new NFA(); final NFA NFA = new NFA();
NFA.insert("say", "her", "he", "she", "shr"); NFA.insert("say", "her", "he", "she", "shr");
NFA.buildAc(); // NFA.buildAc();
final WordTree wordTree = new WordTree(); final WordTree wordTree = new WordTree();
wordTree.addWords("say", "her", "he", "she", "shr"); wordTree.addWords("say", "her", "he", "she", "shr");
@ -89,7 +89,7 @@ public class NFATest {
stopWatch.start("automaton_char_buid_find"); stopWatch.start("automaton_char_buid_find");
final NFA NFALocal = new NFA(); final NFA NFALocal = new NFA();
NFALocal.insert("say", "her", "he", "she", "shr"); NFALocal.insert("say", "her", "he", "she", "shr");
NFALocal.buildAc(); // NFALocal.buildAc();
final List<FoundWord> ans1 = NFALocal.find(input); final List<FoundWord> ans1 = NFALocal.find(input);
stopWatch.stop(); stopWatch.stop();
@ -124,7 +124,7 @@ public class NFATest {
stopWatch.start("automaton_cn_build_find"); stopWatch.start("automaton_cn_build_find");
final NFA NFALocal = new NFA(); final NFA NFALocal = new NFA();
NFALocal.insert("", "赵啊", "赵啊三"); NFALocal.insert("", "赵啊", "赵啊三");
NFALocal.buildAc(); // NFALocal.buildAc();
final List<FoundWord> result = NFALocal.find(input); final List<FoundWord> result = NFALocal.find(input);
stopWatch.stop(); stopWatch.stop();
@ -161,7 +161,7 @@ public class NFATest {
final NFA NFALocal = new NFA(); final NFA NFALocal = new NFA();
NFALocal.insert("", "赵啊", "赵啊三"); NFALocal.insert("", "赵啊", "赵啊三");
NFALocal.buildAc(); // NFALocal.buildAc();
stopWatch.start("automaton_cn_find"); stopWatch.start("automaton_cn_find");
final List<FoundWord> result = NFALocal.find(input); final List<FoundWord> result = NFALocal.find(input);
@ -200,7 +200,7 @@ public class NFATest {
final NFA NFALocal = new NFA(); final NFA NFALocal = new NFA();
NFALocal.insert("", "赵啊", "赵啊三"); NFALocal.insert("", "赵啊", "赵啊三");
NFALocal.buildAc(); // NFALocal.buildAc();
stopWatch.start("automaton_cn_find_not_density"); stopWatch.start("automaton_cn_find_not_density");
final List<FoundWord> result = NFALocal.find(input, false); final List<FoundWord> result = NFALocal.find(input, false);