diff --git a/hutool-core/src/main/java/org/dromara/hutool/core/text/dfa/WordTree.java b/hutool-core/src/main/java/org/dromara/hutool/core/text/dfa/WordTree.java index 82f708db0..994033c45 100644 --- a/hutool-core/src/main/java/org/dromara/hutool/core/text/dfa/WordTree.java +++ b/hutool-core/src/main/java/org/dromara/hutool/core/text/dfa/WordTree.java @@ -14,15 +14,11 @@ package org.dromara.hutool.core.text.dfa; import org.dromara.hutool.core.collection.CollUtil; import org.dromara.hutool.core.collection.set.SetUtil; +import org.dromara.hutool.core.lang.Console; import org.dromara.hutool.core.map.MapUtil; import org.dromara.hutool.core.text.StrUtil; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; import java.util.function.Predicate; /** @@ -251,8 +247,8 @@ public class WordTree extends HashMap { /** * 找出所有匹配的关键字
*

假如被检查文本是{@literal "abab"}
- * 密集匹配原则:假如关键词有 ab,b,将匹配 [ab,b,ab]
- * 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[a, ab] + * 密集匹配原则:假如关键词有 ab,b,将匹配 [ab,b,ab,b]
+ * 贪婪匹配(最长匹配)原则:假如关键字a,ab,最长匹配将匹配[ab, ab] *

* * @param text 被检查的文本 @@ -279,6 +275,8 @@ public class WordTree extends HashMap { current = this; wordBuffer.setLength(0); keyBuffer.setLength(0); + + FoundWord currentFoundWord = null; for (int j = i; j < length; j++) { currentChar = text.charAt(j); if (!charFilter.test(currentChar)) { @@ -291,31 +289,34 @@ public class WordTree extends HashMap { } continue; } else if (!current.containsKey(currentChar)) { - //非关键字符被整体略过,重新以下个字符开始检查 + // 节点不匹配,开始下一轮 break; } wordBuffer.append(currentChar); keyBuffer.append(currentChar); if (current.isEnd(currentChar)) { //到达单词末尾,关键词成立,从此词的下一个位置开始查找 - foundWords.add(new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j)); - if (limit > 0 && foundWords.size() >= limit) { - //超过匹配限制个数,直接返回 - return foundWords; - } + currentFoundWord = new FoundWord(keyBuffer.toString(), wordBuffer.toString(), i, j); + //如果非密度匹配,跳过匹配到的词 if (!isDensityMatch) { - //如果非密度匹配,跳过匹配到的词 i = j; - break; } + + //如果懒惰匹配(非贪婪匹配)。当遇到第一个结尾标记就结束本轮匹配 if (!isGreedMatch) { - //如果懒惰匹配(非贪婪匹配)。当遇到第一个结尾标记就结束本轮匹配 break; } } + // 查找下一个节点,节点始终不会为null,因为当前阶段或匹配结束,或匹配不到结束 current = current.get(currentChar); - if (null == current) { - break; + } + + // 本次循环结尾,加入遗留匹配的单词 + if(null != currentFoundWord){ + foundWords.add(currentFoundWord); + if (limit > 0 && foundWords.size() >= limit) { + //超过匹配限制个数,直接返回 + return foundWords; } } } diff --git a/hutool-core/src/test/java/org/dromara/hutool/core/text/dfa/DfaTest.java b/hutool-core/src/test/java/org/dromara/hutool/core/text/dfa/DfaTest.java index c7b0eaacf..ff1c1f30b 100644 --- a/hutool-core/src/test/java/org/dromara/hutool/core/text/dfa/DfaTest.java +++ b/hutool-core/src/test/java/org/dromara/hutool/core/text/dfa/DfaTest.java @@ -22,7 +22,6 @@ import java.util.List; * DFA单元测试 * * @author Looly - * */ public class DfaTest { @@ -59,7 +58,9 @@ public class DfaTest { } /** - * 贪婪非密集匹配原则测试 + * 贪婪非密集匹配原则测试
+ * 贪婪:最长匹配 + * 非密集:跳过匹配到的 */ @Test public void greedMatchTest() { @@ -68,15 +69,16 @@ public class DfaTest { // ----------------------------------------------------------------------------------------------------------------------------------- // 情况三:匹配到最长关键词,跳过已经匹配的关键词 - // 匹配到【大】,由于非密集匹配,因此从下一个字符开始查找,匹配到【土豆】接着被匹配 + // 匹配到【大】和【大土豆】,最长匹配则保留【大土豆】,非密集匹配,【土豆】跳过。 // 由于【刚出锅】被匹配,由于非密集匹配,【出锅】被跳过 final List matchAll = tree.matchAll(text, -1, false, true); - Assertions.assertEquals(matchAll, ListUtil.of("大", "土^豆", "刚出锅")); - + Assertions.assertEquals(ListUtil.of("大土^豆", "刚出锅"), matchAll); } /** * 密集匹配原则(最长匹配)和贪婪匹配原则测试 + * 贪婪:最长匹配 + * 密集:不跳过匹配到的 */ @Test public void densityAndGreedMatchTest() { @@ -85,34 +87,29 @@ public class DfaTest { // ----------------------------------------------------------------------------------------------------------------------------------- // 情况四:匹配到最长关键词,不跳过已经匹配的关键词(最全关键词) - // 匹配到【大】,由于到最长匹配,因此【大土豆】接着被匹配,由于不跳过已经匹配的关键词,土豆继续被匹配 + // 匹配到【大】和【大土豆】,由于到最长匹配,因此【大土豆】保留,由于不跳过已经匹配的关键词,【土豆】继续被匹配 // 【刚出锅】被匹配,由于不跳过已经匹配的词,【出锅】被匹配 final List matchAll = tree.matchAll(text, -1, true, true); - Assertions.assertEquals(matchAll, ListUtil.of("大", "大土^豆", "土^豆", "刚出锅", "出锅")); + Assertions.assertEquals(ListUtil.of("大土^豆", "土^豆", "刚出锅", "出锅"), matchAll); } + /** + * 由于贪婪匹配,因此【赵】、【赵阿】都被跳过,只保留最长的【赵阿三】 + */ @Test - public void densityAndGreedMatchTest2(){ + public void densityAndGreedMatchTest2() { final WordTree tree = new WordTree(); tree.addWord("赵"); tree.addWord("赵阿"); tree.addWord("赵阿三"); final List result = tree.matchAllWords("赵阿三在做什么", -1, true, true); - Assertions.assertEquals(3, result.size()); + Assertions.assertEquals(1, result.size()); - Assertions.assertEquals("赵", result.get(0).getWord()); + Assertions.assertEquals("赵阿三", result.get(0).getWord()); Assertions.assertEquals(0, result.get(0).getBeginIndex().intValue()); - Assertions.assertEquals(0, result.get(0).getEndIndex().intValue()); - - Assertions.assertEquals("赵阿", result.get(1).getWord()); - 
Assertions.assertEquals(0, result.get(1).getBeginIndex().intValue()); - Assertions.assertEquals(1, result.get(1).getEndIndex().intValue()); - - Assertions.assertEquals("赵阿三", result.get(2).getWord()); - Assertions.assertEquals(0, result.get(2).getBeginIndex().intValue()); - Assertions.assertEquals(2, result.get(2).getEndIndex().intValue()); + Assertions.assertEquals(2, result.get(0).getEndIndex().intValue()); } /** @@ -128,7 +125,7 @@ public class DfaTest { } @Test - public void aTest(){ + public void aTest() { final WordTree tree = new WordTree(); tree.addWord("women"); final String text = "a WOMEN todo.".toLowerCase(); @@ -137,13 +134,13 @@ public class DfaTest { } @Test - public void clearTest(){ + public void clearTest() { WordTree tree = new WordTree(); tree.addWord("黑"); Assertions.assertTrue(tree.matchAll("黑大衣").contains("黑")); //clear时直接调用Map的clear并没有把endCharacterSet清理掉 tree.clear(); - tree.addWords("黑大衣","红色大衣"); + tree.addWords("黑大衣", "红色大衣"); //clear() 覆写前 这里想匹配到黑大衣,但是却匹配到了黑 // Assertions.assertFalse(tree.matchAll("黑大衣").contains("黑大衣")); @@ -155,12 +152,13 @@ public class DfaTest { //如果不覆写只能通过new出新对象才不会有问题 tree = new WordTree(); - tree.addWords("黑大衣","红色大衣"); + tree.addWords("黑大衣", "红色大衣"); Assertions.assertTrue(tree.matchAll("黑大衣").contains("黑大衣")); Assertions.assertTrue(tree.matchAll("红色大衣").contains("红色大衣")); } // ---------------------------------------------------------------------------------------------------------- + /** * 构建查找树 * @@ -176,4 +174,39 @@ public class DfaTest { tree.addWord("出锅"); return tree; } + + @Test + void issueI8LAEWTest() { + final WordTree wordTree = new WordTree(); + wordTree.addWords("UserServiceImpl", "UserService"); + + final String text = "This is test Service: UserServiceImpl UserServiceTest..."; + final List strings = wordTree.matchAll(text, -1, false, true); + Assertions.assertEquals("[UserServiceImpl, UserService]", strings.toString()); + } + + /** + * 此测试验证边界问题,当最后一个字符匹配时的问题 + */ + @Test + void matchAbTest() { + 
final WordTree wordTree = new WordTree(); + wordTree.addWords("ab", "b"); + + // 非密集,非贪婪 + List strings = wordTree.matchAll("abab", -1, false, false); + Assertions.assertEquals("[ab, ab]", strings.toString()); + + // 密集,非贪婪 + strings = wordTree.matchAll("abab", -1, true, false); + Assertions.assertEquals("[ab, b, ab, b]", strings.toString()); + + // 非密集,贪婪 + strings = wordTree.matchAll("abab", -1, false, true); + Assertions.assertEquals("[ab, ab]", strings.toString()); + + // 密集,贪婪 + strings = wordTree.matchAll("abab", -1, true, true); + Assertions.assertEquals("[ab, b, ab, b]", strings.toString()); + } }