mirror of
https://gitee.com/chinabugotech/hutool.git
synced 2025-04-19 03:01:48 +08:00
修改DFA逻辑,贪婪匹配优先长匹配而舍弃短匹配
This commit is contained in:
parent
73152c5361
commit
0697bc1457
@ -14,15 +14,11 @@ package org.dromara.hutool.core.text.dfa;
|
||||
|
||||
import org.dromara.hutool.core.collection.CollUtil;
|
||||
import org.dromara.hutool.core.collection.set.SetUtil;
|
||||
import org.dromara.hutool.core.lang.Console;
|
||||
import org.dromara.hutool.core.map.MapUtil;
|
||||
import org.dromara.hutool.core.text.StrUtil;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
/**
|
||||
@ -251,8 +247,8 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
/**
|
||||
* 找出所有匹配的关键字<br>
|
||||
* <p>假如被检查文本是{@literal "abab"}<br>
|
||||
* 密集匹配原则:假如关键词有 ab,b,将匹配 [ab,b,ab]<br>
|
||||
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab]
|
||||
* 密集匹配原则:假如关键词有 ab,b,将匹配 [ab,b,ab,b]<br>
|
||||
* 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[ab]
|
||||
* </p>
|
||||
*
|
||||
* @param text 被检查的文本
|
||||
@ -279,6 +275,8 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
current = this;
|
||||
wordBuffer.setLength(0);
|
||||
keyBuffer.setLength(0);
|
||||
|
||||
FoundWord currentFoundWord = null;
|
||||
for (int j = i; j < length; j++) {
|
||||
currentChar = text.charAt(j);
|
||||
if (!charFilter.test(currentChar)) {
|
||||
@ -291,31 +289,34 @@ public class WordTree extends HashMap<Character, WordTree> {
|
||||
}
|
||||
continue;
|
||||
} else if (!current.containsKey(currentChar)) {
|
||||
//非关键字符被整体略过,重新以下个字符开始检查
|
||||
// 节点不匹配,开始下一轮
|
||||
break;
|
||||
}
|
||||
wordBuffer.append(currentChar);
|
||||
keyBuffer.append(currentChar);
|
||||
if (current.isEnd(currentChar)) {
|
||||
//到达单词末尾,关键词成立,从此词的下一个位置开始查找
|
||||
foundWords.add(new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j));
|
||||
if (limit > 0 && foundWords.size() >= limit) {
|
||||
//超过匹配限制个数,直接返回
|
||||
return foundWords;
|
||||
}
|
||||
currentFoundWord = new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j);
|
||||
//如果非密度匹配,跳过匹配到的词
|
||||
if (!isDensityMatch) {
|
||||
//如果非密度匹配,跳过匹配到的词
|
||||
i = j;
|
||||
break;
|
||||
}
|
||||
|
||||
//如果懒惰匹配(非贪婪匹配)。当遇到第一个结尾标记就结束本轮匹配
|
||||
if (!isGreedMatch) {
|
||||
//如果懒惰匹配(非贪婪匹配)。当遇到第一个结尾标记就结束本轮匹配
|
||||
break;
|
||||
}
|
||||
}
|
||||
// 查找下一个节点,节点始终不会为null,因为当前阶段或匹配结束,或匹配不到结束
|
||||
current = current.get(currentChar);
|
||||
if (null == current) {
|
||||
break;
|
||||
}
|
||||
|
||||
// 本次循环结尾,加入遗留匹配的单词
|
||||
if(null != currentFoundWord){
|
||||
foundWords.add(currentFoundWord);
|
||||
if (limit > 0 && foundWords.size() >= limit) {
|
||||
//超过匹配限制个数,直接返回
|
||||
return foundWords;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -22,7 +22,6 @@ import java.util.List;
|
||||
* DFA单元测试
|
||||
*
|
||||
* @author Looly
|
||||
*
|
||||
*/
|
||||
public class DfaTest {
|
||||
|
||||
@ -59,7 +58,9 @@ public class DfaTest {
|
||||
}
|
||||
|
||||
/**
|
||||
* 贪婪非密集匹配原则测试
|
||||
* 贪婪非密集匹配原则测试<br>
|
||||
* 贪婪:最长匹配
|
||||
* 非密集:跳过匹配到的
|
||||
*/
|
||||
@Test
|
||||
public void greedMatchTest() {
|
||||
@ -68,15 +69,16 @@ public class DfaTest {
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------------------------
|
||||
// 情况三:匹配到最长关键词,跳过已经匹配的关键词
|
||||
// 匹配到【大】,由于非密集匹配,因此从下一个字符开始查找,匹配到【土豆】接着被匹配
|
||||
// 匹配到【大】和【大土豆】,最长匹配则保留【大土豆】,非密集匹配,【土豆】跳过。
|
||||
// 由于【刚出锅】被匹配,由于非密集匹配,【出锅】被跳过
|
||||
final List<String> matchAll = tree.matchAll(text, -1, false, true);
|
||||
Assertions.assertEquals(matchAll, ListUtil.of("大", "土^豆", "刚出锅"));
|
||||
|
||||
Assertions.assertEquals(ListUtil.of("大土^豆", "刚出锅"), matchAll);
|
||||
}
|
||||
|
||||
/**
|
||||
* 密集匹配原则(最长匹配)和贪婪匹配原则测试
|
||||
* 贪婪:最长匹配
|
||||
* 密集:不跳过匹配到的
|
||||
*/
|
||||
@Test
|
||||
public void densityAndGreedMatchTest() {
|
||||
@ -85,34 +87,29 @@ public class DfaTest {
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------------------------
|
||||
// 情况四:匹配到最长关键词,不跳过已经匹配的关键词(最全关键词)
|
||||
// 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配,由于不跳过已经匹配的关键词,土豆继续被匹配
|
||||
// 匹配到【大】和【大土豆】,由于到最长匹配,因此【大土豆】保留,由于不跳过已经匹配的关键词,【土豆】继续被匹配
|
||||
// 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配
|
||||
final List<String> matchAll = tree.matchAll(text, -1, true, true);
|
||||
Assertions.assertEquals(matchAll, ListUtil.of("大", "大土^豆", "土^豆", "刚出锅", "出锅"));
|
||||
Assertions.assertEquals(ListUtil.of("大土^豆", "土^豆", "刚出锅", "出锅"), matchAll);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 由于贪婪匹配,因此【赵】、【赵阿】都被跳过,只保留最长的【赵阿三】
|
||||
*/
|
||||
@Test
|
||||
public void densityAndGreedMatchTest2(){
|
||||
public void densityAndGreedMatchTest2() {
|
||||
final WordTree tree = new WordTree();
|
||||
tree.addWord("赵");
|
||||
tree.addWord("赵阿");
|
||||
tree.addWord("赵阿三");
|
||||
|
||||
final List<FoundWord> result = tree.matchAllWords("赵阿三在做什么", -1, true, true);
|
||||
Assertions.assertEquals(3, result.size());
|
||||
Assertions.assertEquals(1, result.size());
|
||||
|
||||
Assertions.assertEquals("赵", result.get(0).getWord());
|
||||
Assertions.assertEquals("赵阿三", result.get(0).getWord());
|
||||
Assertions.assertEquals(0, result.get(0).getBeginIndex().intValue());
|
||||
Assertions.assertEquals(0, result.get(0).getEndIndex().intValue());
|
||||
|
||||
Assertions.assertEquals("赵阿", result.get(1).getWord());
|
||||
Assertions.assertEquals(0, result.get(1).getBeginIndex().intValue());
|
||||
Assertions.assertEquals(1, result.get(1).getEndIndex().intValue());
|
||||
|
||||
Assertions.assertEquals("赵阿三", result.get(2).getWord());
|
||||
Assertions.assertEquals(0, result.get(2).getBeginIndex().intValue());
|
||||
Assertions.assertEquals(2, result.get(2).getEndIndex().intValue());
|
||||
Assertions.assertEquals(2, result.get(0).getEndIndex().intValue());
|
||||
}
|
||||
|
||||
/**
|
||||
@ -128,7 +125,7 @@ public class DfaTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void aTest(){
|
||||
public void aTest() {
|
||||
final WordTree tree = new WordTree();
|
||||
tree.addWord("women");
|
||||
final String text = "a WOMEN todo.".toLowerCase();
|
||||
@ -137,13 +134,13 @@ public class DfaTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void clearTest(){
|
||||
public void clearTest() {
|
||||
WordTree tree = new WordTree();
|
||||
tree.addWord("黑");
|
||||
Assertions.assertTrue(tree.matchAll("黑大衣").contains("黑"));
|
||||
//clear时直接调用Map的clear并没有把endCharacterSet清理掉
|
||||
tree.clear();
|
||||
tree.addWords("黑大衣","红色大衣");
|
||||
tree.addWords("黑大衣", "红色大衣");
|
||||
|
||||
//clear() 覆写前 这里想匹配到黑大衣,但是却匹配到了黑
|
||||
// Assertions.assertFalse(tree.matchAll("黑大衣").contains("黑大衣"));
|
||||
@ -155,12 +152,13 @@ public class DfaTest {
|
||||
|
||||
//如果不覆写只能通过new出新对象才不会有问题
|
||||
tree = new WordTree();
|
||||
tree.addWords("黑大衣","红色大衣");
|
||||
tree.addWords("黑大衣", "红色大衣");
|
||||
Assertions.assertTrue(tree.matchAll("黑大衣").contains("黑大衣"));
|
||||
Assertions.assertTrue(tree.matchAll("红色大衣").contains("红色大衣"));
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* 构建查找树
|
||||
*
|
||||
@ -176,4 +174,39 @@ public class DfaTest {
|
||||
tree.addWord("出锅");
|
||||
return tree;
|
||||
}
|
||||
|
||||
@Test
|
||||
void issueI8LAEWTest() {
|
||||
final WordTree wordTree = new WordTree();
|
||||
wordTree.addWords("UserServiceImpl", "UserService");
|
||||
|
||||
final String text = "This is test Service: UserServiceImpl UserServiceTest...";
|
||||
final List<String> strings = wordTree.matchAll(text, -1, false, true);
|
||||
Assertions.assertEquals("[UserServiceImpl, UserService]", strings.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* 此测试验证边界问题,当最后一个字符匹配时的问题
|
||||
*/
|
||||
@Test
|
||||
void matchAbTest() {
|
||||
final WordTree wordTree = new WordTree();
|
||||
wordTree.addWords("ab", "b");
|
||||
|
||||
// 非密集,非贪婪
|
||||
List<String> strings = wordTree.matchAll("abab", -1, false, false);
|
||||
Assertions.assertEquals("[ab, ab]", strings.toString());
|
||||
|
||||
// 密集,非贪婪
|
||||
strings = wordTree.matchAll("abab", -1, true, false);
|
||||
Assertions.assertEquals("[ab, b, ab, b]", strings.toString());
|
||||
|
||||
// 非密集,贪婪
|
||||
strings = wordTree.matchAll("abab", -1, false, true);
|
||||
Assertions.assertEquals("[ab, ab]", strings.toString());
|
||||
|
||||
// 密集,贪婪
|
||||
strings = wordTree.matchAll("abab", -1, true, true);
|
||||
Assertions.assertEquals("[ab, b, ab, b]", strings.toString());
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user