mirror of
https://gitee.com/chinabugotech/hutool.git
synced 2025-05-09 23:51:34 +08:00
Merge pull request #2992 from veteran2018/v6-dev
fix: (基于NFA模型 实现的AC自动机)优化 调用方 需要显式触发API 完成树优化 的问题
This commit is contained in:
commit
f50cb5e8df
@ -4,142 +4,186 @@ import java.util.*;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p>
|
||||||
*
|
|
||||||
* 基于非确定性有穷自动机(NFA) 实现的多模匹配工具
|
* 基于非确定性有穷自动机(NFA) 实现的多模匹配工具
|
||||||
|
* </p>
|
||||||
*
|
*
|
||||||
* @author renyp
|
* @author renyp
|
||||||
*/
|
*/
|
||||||
public class NFA {
|
public class NFA {
|
||||||
private final Node root;
|
/**
|
||||||
|
* AC树的根节点
|
||||||
|
*/
|
||||||
|
private final Node root;
|
||||||
|
/**
|
||||||
|
* 标记是否需要构建AC自动机,做树优化
|
||||||
|
*/
|
||||||
|
private volatile boolean needBuildAc;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 默认构造
|
* 内置锁,防止并发场景,并行建AC树,造成不可预知结果
|
||||||
*/
|
*/
|
||||||
public NFA() {
|
private final Object buildAcLock;
|
||||||
this.root = new Node();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 构造函数 并 初始化词库
|
* 内置锁,防止并行插入,新节点建立后,被挂载到树上前 被篡改
|
||||||
*
|
*/
|
||||||
* @param words 添加的新词
|
private final Object insertTreeLock;
|
||||||
*/
|
|
||||||
public NFA(final String... words) {
|
|
||||||
this();
|
|
||||||
this.insert(words);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 词库添加新词,初始化查找树
|
* 默认构造
|
||||||
*
|
*/
|
||||||
* @param word 添加的新词
|
public NFA() {
|
||||||
*/
|
this.root = new Node();
|
||||||
public void insert(final String word) {
|
this.needBuildAc = true;
|
||||||
Node p = root;
|
this.buildAcLock = new Object();
|
||||||
for (final char curr : word.toCharArray()) {
|
this.insertTreeLock = new Object();
|
||||||
if (p.next.get((int) curr) == null) {
|
}
|
||||||
p.next.put((int) curr, new Node());
|
|
||||||
}
|
|
||||||
p = p.next.get((int) curr);
|
|
||||||
}
|
|
||||||
p.flag = true;
|
|
||||||
p.str = word;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 词库批量添加新词,初始化查找树
|
* 构造函数 并 初始化词库
|
||||||
*
|
*
|
||||||
* @param words 添加的新词
|
* @param words 添加的新词
|
||||||
*/
|
*/
|
||||||
public void insert(final String... words) {
|
public NFA(String... words) {
|
||||||
for (final String word : words) {
|
this();
|
||||||
this.insert(word);
|
this.insert(words);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 构建基于NFA模型的 AC自动机
|
* 词库添加新词,初始化查找树
|
||||||
*/
|
*
|
||||||
public void buildAc() {
|
* @param word 添加的新词
|
||||||
final Queue<Node> queue = new LinkedList<>();
|
*/
|
||||||
final Node p = root;
|
public void insert(String word) {
|
||||||
for (final Integer key : p.next.keySet()) {
|
synchronized (insertTreeLock) {
|
||||||
p.next.get(key).fail = root;
|
needBuildAc = true;
|
||||||
queue.offer(p.next.get(key));
|
Node p = root;
|
||||||
}
|
for (char curr : word.toCharArray()) {
|
||||||
while (!queue.isEmpty()) {
|
int ind = curr;
|
||||||
final Node curr = queue.poll();
|
if (p.next.get(ind) == null) {
|
||||||
for (final Integer key : curr.next.keySet()) {
|
p.next.put(ind, new Node());
|
||||||
Node fail = curr.fail;
|
}
|
||||||
// 查找当前节点匹配失败,他对应等效匹配的节点是哪个
|
p = p.next.get(ind);
|
||||||
while (fail != null && fail.next.get(key) == null) {
|
}
|
||||||
fail = fail.fail;
|
p.flag = true;
|
||||||
}
|
p.str = word;
|
||||||
// 代码到这,有两种可能,fail不为null,说明找到了fail;fail为null,没有找到,那么就把fail指向root节点(当到该节点匹配失败,那么从root节点开始重新匹配)
|
}
|
||||||
if (fail != null) {
|
}
|
||||||
fail = fail.next.get(key);
|
|
||||||
} else {
|
|
||||||
fail = root;
|
|
||||||
}
|
|
||||||
curr.next.get(key).fail = fail;
|
|
||||||
queue.offer(curr.next.get(key));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param text 查询的文本(母串)
|
* 词库批量添加新词,初始化查找树
|
||||||
* @return 关键字列表
|
*
|
||||||
*/
|
* @param words 添加的新词
|
||||||
public List<FoundWord> find(final String text) {
|
*/
|
||||||
return this.find(text, true);
|
public void insert(String... words) {
|
||||||
}
|
for (String word : words) {
|
||||||
|
this.insert(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param text 查找的文本(母串)
|
* 构建基于NFA模型的 AC自动机
|
||||||
* @param isDensityMatch 是否密集匹配
|
*/
|
||||||
* @return 关键字列表
|
private void buildAc() {
|
||||||
*/
|
Queue<Node> queue = new LinkedList<>();
|
||||||
public List<FoundWord> find(final String text, final boolean isDensityMatch) {
|
Node p = root;
|
||||||
final List<FoundWord> ans = new ArrayList<>();
|
for (Integer key : p.next.keySet()) {
|
||||||
Node p = root, k;
|
p.next.get(key).fail = root;
|
||||||
for (int i = 0, len = text.length(); i < len; i++) {
|
queue.offer(p.next.get(key));
|
||||||
final int ind = text.charAt(i);
|
}
|
||||||
// 状态转移(沿着fail指针链接的链表,此处区别于DFA模型)
|
while (!queue.isEmpty()) {
|
||||||
while (p != null && p.next.get(ind) == null) {
|
Node curr = queue.poll();
|
||||||
p = p.fail;
|
for (Integer key : curr.next.keySet()) {
|
||||||
}
|
Node fail = curr.fail;
|
||||||
if (p == null) {
|
// 查找当前节点匹配失败,他对应等效匹配的节点是哪个
|
||||||
p = root;
|
while (fail != null && fail.next.get(key) == null) {
|
||||||
} else {
|
fail = fail.fail;
|
||||||
p = p.next.get(ind);
|
}
|
||||||
}
|
// 代码到这,有两种可能,fail不为null,说明找到了fail;fail为null,没有找到,那么就把fail指向root节点(当到该节点匹配失败,那么从root节点开始重新匹配)
|
||||||
// 提取结果(沿着fail指针链接的链表,此处区别于DFA模型)
|
if (fail != null) {
|
||||||
k = p;
|
fail = fail.next.get(key);
|
||||||
while (k != null) {
|
} else {
|
||||||
if (k.flag) {
|
fail = root;
|
||||||
ans.add(new FoundWord(k.str, k.str, i - k.str.length() + 1, i));
|
}
|
||||||
if (!isDensityMatch) {
|
curr.next.get(key).fail = fail;
|
||||||
p = root;
|
queue.offer(curr.next.get(key));
|
||||||
break;
|
}
|
||||||
}
|
}
|
||||||
}
|
needBuildAc = false;
|
||||||
k = k.fail;
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
return ans;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class Node {
|
/**
|
||||||
|
* @param text 查询的文本(母串)
|
||||||
|
*/
|
||||||
|
public List<FoundWord> find(String text) {
|
||||||
|
return this.find(text, true);
|
||||||
|
}
|
||||||
|
|
||||||
boolean flag;
|
/**
|
||||||
Node fail;
|
* @param text 查找的文本(母串)
|
||||||
String str;
|
* @param isDensityMatch 是否密集匹配
|
||||||
Map<Integer, Node> next;
|
*/
|
||||||
|
public List<FoundWord> find(String text, boolean isDensityMatch) {
|
||||||
|
// double check,防止重复无用的 buildAC
|
||||||
|
if (needBuildAc) {
|
||||||
|
synchronized (buildAcLock) {
|
||||||
|
if (needBuildAc) {
|
||||||
|
this.buildAc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
List<FoundWord> ans = new ArrayList<>();
|
||||||
|
Node p = root, k = null;
|
||||||
|
for (int i = 0, len = text.length(); i < len; i++) {
|
||||||
|
int ind = text.charAt(i);
|
||||||
|
// 状态转移(沿着fail指针链接的链表,此处区别于DFA模型)
|
||||||
|
while (p != null && p.next.get(ind) == null) {
|
||||||
|
p = p.fail;
|
||||||
|
}
|
||||||
|
if (p == null) {
|
||||||
|
p = root;
|
||||||
|
} else {
|
||||||
|
p = p.next.get(ind);
|
||||||
|
}
|
||||||
|
// 提取结果(沿着fail指针链接的链表,此处区别于DFA模型)
|
||||||
|
k = p;
|
||||||
|
while (k != null) {
|
||||||
|
if (k.flag) {
|
||||||
|
ans.add(new FoundWord(k.str, k.str, i - k.str.length() + 1, i));
|
||||||
|
if (!isDensityMatch) {
|
||||||
|
p = root;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
k = k.fail;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ans;
|
||||||
|
}
|
||||||
|
|
||||||
public Node() {
|
|
||||||
this.flag = false;
|
private static class Node {
|
||||||
next = new HashMap<>();
|
|
||||||
}
|
/**
|
||||||
}
|
* 当前节点是否是一个单词的结尾
|
||||||
|
*/
|
||||||
|
boolean flag;
|
||||||
|
/**
|
||||||
|
* 指向 当前节点匹配失败应该跳转的下个节点
|
||||||
|
*/
|
||||||
|
Node fail;
|
||||||
|
/**
|
||||||
|
* 以当前节点结尾的单词
|
||||||
|
*/
|
||||||
|
String str;
|
||||||
|
/**
|
||||||
|
* 当前节点的子节点
|
||||||
|
*/
|
||||||
|
Map<Integer, Node> next;
|
||||||
|
|
||||||
|
public Node() {
|
||||||
|
this.flag = false;
|
||||||
|
next = new HashMap<>();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -16,7 +16,7 @@ public class NFATest {
|
|||||||
public void testFind() {
|
public void testFind() {
|
||||||
final NFA NFA = new NFA();
|
final NFA NFA = new NFA();
|
||||||
NFA.insert("say", "her", "he", "she", "shr");
|
NFA.insert("say", "her", "he", "she", "shr");
|
||||||
NFA.buildAc();
|
// NFA.buildAc();
|
||||||
|
|
||||||
final WordTree wordTree = new WordTree();
|
final WordTree wordTree = new WordTree();
|
||||||
wordTree.addWords("say", "her", "he", "she", "shr");
|
wordTree.addWords("say", "her", "he", "she", "shr");
|
||||||
@ -53,7 +53,7 @@ public class NFATest {
|
|||||||
public void testFindNotDensity() {
|
public void testFindNotDensity() {
|
||||||
final NFA NFA = new NFA();
|
final NFA NFA = new NFA();
|
||||||
NFA.insert("say", "her", "he", "she", "shr");
|
NFA.insert("say", "her", "he", "she", "shr");
|
||||||
NFA.buildAc();
|
// NFA.buildAc();
|
||||||
|
|
||||||
final WordTree wordTree = new WordTree();
|
final WordTree wordTree = new WordTree();
|
||||||
wordTree.addWords("say", "her", "he", "she", "shr");
|
wordTree.addWords("say", "her", "he", "she", "shr");
|
||||||
@ -89,7 +89,7 @@ public class NFATest {
|
|||||||
stopWatch.start("automaton_char_buid_find");
|
stopWatch.start("automaton_char_buid_find");
|
||||||
final NFA NFALocal = new NFA();
|
final NFA NFALocal = new NFA();
|
||||||
NFALocal.insert("say", "her", "he", "she", "shr");
|
NFALocal.insert("say", "her", "he", "she", "shr");
|
||||||
NFALocal.buildAc();
|
// NFALocal.buildAc();
|
||||||
final List<FoundWord> ans1 = NFALocal.find(input);
|
final List<FoundWord> ans1 = NFALocal.find(input);
|
||||||
stopWatch.stop();
|
stopWatch.stop();
|
||||||
|
|
||||||
@ -124,7 +124,7 @@ public class NFATest {
|
|||||||
stopWatch.start("automaton_cn_build_find");
|
stopWatch.start("automaton_cn_build_find");
|
||||||
final NFA NFALocal = new NFA();
|
final NFA NFALocal = new NFA();
|
||||||
NFALocal.insert("赵", "赵啊", "赵啊三");
|
NFALocal.insert("赵", "赵啊", "赵啊三");
|
||||||
NFALocal.buildAc();
|
// NFALocal.buildAc();
|
||||||
|
|
||||||
final List<FoundWord> result = NFALocal.find(input);
|
final List<FoundWord> result = NFALocal.find(input);
|
||||||
stopWatch.stop();
|
stopWatch.stop();
|
||||||
@ -161,7 +161,7 @@ public class NFATest {
|
|||||||
|
|
||||||
final NFA NFALocal = new NFA();
|
final NFA NFALocal = new NFA();
|
||||||
NFALocal.insert("赵", "赵啊", "赵啊三");
|
NFALocal.insert("赵", "赵啊", "赵啊三");
|
||||||
NFALocal.buildAc();
|
// NFALocal.buildAc();
|
||||||
|
|
||||||
stopWatch.start("automaton_cn_find");
|
stopWatch.start("automaton_cn_find");
|
||||||
final List<FoundWord> result = NFALocal.find(input);
|
final List<FoundWord> result = NFALocal.find(input);
|
||||||
@ -200,7 +200,7 @@ public class NFATest {
|
|||||||
|
|
||||||
final NFA NFALocal = new NFA();
|
final NFA NFALocal = new NFA();
|
||||||
NFALocal.insert("赵", "赵啊", "赵啊三");
|
NFALocal.insert("赵", "赵啊", "赵啊三");
|
||||||
NFALocal.buildAc();
|
// NFALocal.buildAc();
|
||||||
|
|
||||||
stopWatch.start("automaton_cn_find_not_density");
|
stopWatch.start("automaton_cn_find_not_density");
|
||||||
final List<FoundWord> result = NFALocal.find(input, false);
|
final List<FoundWord> result = NFALocal.find(input, false);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user