修改DFA逻辑,贪婪匹配优先长匹配而舍弃短匹配

This commit is contained in:
Looly 2023-12-06 00:49:12 +08:00
parent 73152c5361
commit 0697bc1457
2 changed files with 76 additions and 42 deletions

View File

@ -14,15 +14,11 @@ package org.dromara.hutool.core.text.dfa;
import org.dromara.hutool.core.collection.CollUtil;
import org.dromara.hutool.core.collection.set.SetUtil;
import org.dromara.hutool.core.lang.Console;
import org.dromara.hutool.core.map.MapUtil;
import org.dromara.hutool.core.text.StrUtil;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.function.Predicate;
/**
@ -251,8 +247,8 @@ public class WordTree extends HashMap<Character, WordTree> {
/**
* 找出所有匹配的关键字<br>
* <p>假如被检查文本是{@literal "abab"}<br>
* 密集匹配原则假如关键词有 ab,b将匹配 [ab,b,ab]<br>
* 贪婪匹配最长匹配原则假如关键字a,ab最长匹配将匹配[a, ab]
* 密集匹配原则假如关键词有 ab,b将匹配 [ab,b,ab,b]<br>
* 贪婪匹配最长匹配原则假如关键字a,ab最长匹配将匹配[ab]
* </p>
*
* @param text 被检查的文本
@ -279,6 +275,8 @@ public class WordTree extends HashMap<Character, WordTree> {
current = this;
wordBuffer.setLength(0);
keyBuffer.setLength(0);
FoundWord currentFoundWord = null;
for (int j = i; j < length; j++) {
currentChar = text.charAt(j);
if (!charFilter.test(currentChar)) {
@ -291,31 +289,34 @@ public class WordTree extends HashMap<Character, WordTree> {
}
continue;
} else if (!current.containsKey(currentChar)) {
//非关键字符被整体略过重新以下个字符开始检查
// 节点不匹配开始下一轮
break;
}
wordBuffer.append(currentChar);
keyBuffer.append(currentChar);
if (current.isEnd(currentChar)) {
//到达单词末尾关键词成立从此词的下一个位置开始查找
foundWords.add(new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j));
if (limit > 0 && foundWords.size() >= limit) {
//超过匹配限制个数直接返回
return foundWords;
}
currentFoundWord = new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j);
//如果非密度匹配跳过匹配到的词
if (!isDensityMatch) {
//如果非密度匹配跳过匹配到的词
i = j;
break;
}
//如果懒惰匹配非贪婪匹配当遇到第一个结尾标记就结束本轮匹配
if (!isGreedMatch) {
//如果懒惰匹配非贪婪匹配当遇到第一个结尾标记就结束本轮匹配
break;
}
}
// 查找下一个节点节点始终不会为null因为当前阶段或匹配结束或匹配不到结束
current = current.get(currentChar);
if (null == current) {
break;
}
// 本次循环结尾加入遗留匹配的单词
if(null != currentFoundWord){
foundWords.add(currentFoundWord);
if (limit > 0 && foundWords.size() >= limit) {
//超过匹配限制个数直接返回
return foundWords;
}
}
}

View File

@ -22,7 +22,6 @@ import java.util.List;
* DFA单元测试
*
* @author Looly
*
*/
public class DfaTest {
@ -59,7 +58,9 @@ public class DfaTest {
}
/**
* 贪婪非密集匹配原则测试
* 贪婪非密集匹配原则测试<br>
* 贪婪最长匹配
* 非密集跳过匹配到的
*/
@Test
public void greedMatchTest() {
@ -68,15 +69,16 @@ public class DfaTest {
// -----------------------------------------------------------------------------------------------------------------------------------
// 情况三匹配到最长关键词跳过已经匹配的关键词
// 匹配到由于非密集匹配因此从下一个字符开始查找匹配到土豆接着被匹配
// 匹配到大土豆最长匹配则保留大土豆非密集匹配土豆跳过
// 由于刚出锅被匹配由于非密集匹配出锅被跳过
final List<String> matchAll = tree.matchAll(text, -1, false, true);
Assertions.assertEquals(matchAll, ListUtil.of("", "土^豆", "刚出锅"));
Assertions.assertEquals(ListUtil.of("大土^豆", "刚出锅"), matchAll);
}
/**
* 密集匹配原则最长匹配和贪婪匹配原则测试
* 贪婪最长匹配
* 密集不跳过匹配到的
*/
@Test
public void densityAndGreedMatchTest() {
@ -85,34 +87,29 @@ public class DfaTest {
// -----------------------------------------------------------------------------------------------------------------------------------
// 情况四匹配到最长关键词不跳过已经匹配的关键词最全关键词
// 匹配到由于到最长匹配因此大土豆接着被匹配由于不跳过已经匹配的关键词土豆继续被匹配
// 匹配到大土豆由于到最长匹配因此大土豆保留由于不跳过已经匹配的关键词土豆继续被匹配
// 刚出锅被匹配由于不跳过已经匹配的词出锅被匹配
final List<String> matchAll = tree.matchAll(text, -1, true, true);
Assertions.assertEquals(matchAll, ListUtil.of("", "大土^豆", "土^豆", "刚出锅", "出锅"));
Assertions.assertEquals(ListUtil.of("大土^豆", "土^豆", "刚出锅", "出锅"), matchAll);
}
/**
* 由于贪婪匹配因此赵阿都被跳过只保留最长的赵阿三
*/
@Test
public void densityAndGreedMatchTest2(){
public void densityAndGreedMatchTest2() {
final WordTree tree = new WordTree();
tree.addWord("");
tree.addWord("赵阿");
tree.addWord("赵阿三");
final List<FoundWord> result = tree.matchAllWords("赵阿三在做什么", -1, true, true);
Assertions.assertEquals(3, result.size());
Assertions.assertEquals(1, result.size());
Assertions.assertEquals("", result.get(0).getWord());
Assertions.assertEquals("阿三", result.get(0).getWord());
Assertions.assertEquals(0, result.get(0).getBeginIndex().intValue());
Assertions.assertEquals(0, result.get(0).getEndIndex().intValue());
Assertions.assertEquals("赵阿", result.get(1).getWord());
Assertions.assertEquals(0, result.get(1).getBeginIndex().intValue());
Assertions.assertEquals(1, result.get(1).getEndIndex().intValue());
Assertions.assertEquals("赵阿三", result.get(2).getWord());
Assertions.assertEquals(0, result.get(2).getBeginIndex().intValue());
Assertions.assertEquals(2, result.get(2).getEndIndex().intValue());
Assertions.assertEquals(2, result.get(0).getEndIndex().intValue());
}
/**
@ -128,7 +125,7 @@ public class DfaTest {
}
@Test
public void aTest(){
public void aTest() {
final WordTree tree = new WordTree();
tree.addWord("women");
final String text = "a WOMEN todo.".toLowerCase();
@ -137,13 +134,13 @@ public class DfaTest {
}
@Test
public void clearTest(){
public void clearTest() {
WordTree tree = new WordTree();
tree.addWord("");
Assertions.assertTrue(tree.matchAll("黑大衣").contains(""));
//clear时直接调用Map的clear并没有把endCharacterSet清理掉
tree.clear();
tree.addWords("黑大衣","红色大衣");
tree.addWords("黑大衣", "红色大衣");
//clear() 覆写前 这里想匹配到黑大衣但是却匹配到了黑
// Assertions.assertFalse(tree.matchAll("黑大衣").contains("黑大衣"));
@ -155,12 +152,13 @@ public class DfaTest {
//如果不覆写只能通过new出新对象才不会有问题
tree = new WordTree();
tree.addWords("黑大衣","红色大衣");
tree.addWords("黑大衣", "红色大衣");
Assertions.assertTrue(tree.matchAll("黑大衣").contains("黑大衣"));
Assertions.assertTrue(tree.matchAll("红色大衣").contains("红色大衣"));
}
// ----------------------------------------------------------------------------------------------------------
/**
* 构建查找树
*
@ -176,4 +174,39 @@ public class DfaTest {
tree.addWord("出锅");
return tree;
}
/**
 * Regression test for issue I8LAEW.
 * <p>
 * Registers two keywords where one is a prefix of the other and runs a
 * non-dense, greedy match. The expected result contains the longer keyword
 * first and then the shorter one.
 */
@Test
void issueI8LAEWTest() {
	final WordTree tree = new WordTree();
	tree.addWords("UserServiceImpl", "UserService");

	// Arguments: no result limit (-1), density matching off, greedy matching on.
	final List<String> matched = tree.matchAll(
		"This is test Service: UserServiceImpl UserServiceTest...", -1, false, true);

	Assertions.assertEquals("[UserServiceImpl, UserService]", matched.toString());
}
/**
 * Verifies the boundary case in which the last character of the text is
 * itself part of a match.
 * <p>
 * The keywords {@code "ab"} and {@code "b"} are matched against
 * {@code "abab"} under all four combinations of the density and greedy flags.
 */
@Test
void matchAbTest() {
	final WordTree tree = new WordTree();
	tree.addWords("ab", "b");
	final String text = "abab";

	// Non-dense, non-greedy.
	final List<String> sparseLazy = tree.matchAll(text, -1, false, false);
	Assertions.assertEquals("[ab, ab]", sparseLazy.toString());

	// Dense, non-greedy.
	final List<String> denseLazy = tree.matchAll(text, -1, true, false);
	Assertions.assertEquals("[ab, b, ab, b]", denseLazy.toString());

	// Non-dense, greedy.
	final List<String> sparseGreedy = tree.matchAll(text, -1, false, true);
	Assertions.assertEquals("[ab, ab]", sparseGreedy.toString());

	// Dense, greedy.
	final List<String> denseGreedy = tree.matchAll(text, -1, true, true);
	Assertions.assertEquals("[ab, b, ab, b]", denseGreedy.toString());
}
}