From 1578a4676146c1fed687050f29dc81f64038fdfb Mon Sep 17 00:00:00 2001 From: Looly Date: Sat, 10 Aug 2024 09:00:14 +0800 Subject: [PATCH] =?UTF-8?q?AC=E8=87=AA=E5=8A=A8=E6=9C=BA=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hutool/core/text/CharSequenceUtil.java | 53 ++++--- .../core/text/finder/MultiStrFinder.java | 129 +++++++++--------- .../text/replacer/HighMultiReplacerV2.java | 14 +- 3 files changed, 111 insertions(+), 85 deletions(-) diff --git a/hutool-core/src/main/java/org/dromara/hutool/core/text/CharSequenceUtil.java b/hutool-core/src/main/java/org/dromara/hutool/core/text/CharSequenceUtil.java index 0b740f36c..145044ae4 100644 --- a/hutool-core/src/main/java/org/dromara/hutool/core/text/CharSequenceUtil.java +++ b/hutool-core/src/main/java/org/dromara/hutool/core/text/CharSequenceUtil.java @@ -43,10 +43,7 @@ import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.function.Supplier; -import java.util.function.UnaryOperator; +import java.util.function.*; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -1476,9 +1473,9 @@ public class CharSequenceUtil extends StrValidator { * } * * - * @param str 被处理的字符串 - * @param prefix 前缀 - * @param suffix 后缀 + * @param str 被处理的字符串 + * @param prefix 前缀 + * @param suffix 后缀 * @param ignoreCase 是否忽略大小写 * @return 处理后的字符串 * @since 3.1.2 @@ -1494,17 +1491,17 @@ public class CharSequenceUtil extends StrValidator { if (startWith(str2, prefix, ignoreCase)) { from = prefix.length(); - if(from == to){ + if (from == to) { // "a", "a", "a" -> "" return EMPTY; } } if (endWith(str2, suffix, ignoreCase)) { to -= suffix.length(); - if(from == to){ + if (from == to) { // "a", "a", "a" -> "" return EMPTY; - } else if(to < from){ + } else if (to < from) { // pre去除后和suffix有重叠,如 ("aba", "ab", "ba") -> "a" to += suffix.length(); } @@ -1574,22 +1571,22 @@ public class CharSequenceUtil extends StrValidator { int from = 0; int to = str2.length(); - if(!prefixStr.isEmpty()){ + if (!prefixStr.isEmpty()) { while (str2.startsWith(prefixStr, from)) { from += prefix.length(); - if(from == to){ + if (from == to) { // "a", "a", "a" -> "" return EMPTY; } } } - if(!suffixStr.isEmpty()){ + if (!suffixStr.isEmpty()) { while (str2.startsWith(suffixStr, to - suffixStr.length())) { to -= suffixStr.length(); - if(from == to){ + if (from == to) { // "a", "a", "a" -> "" return EMPTY; - }else if(to < from){ + } else if (to < from) { // pre去除后和suffix有重叠,如 ("aba", "ab", "ba") -> "a" to += suffixStr.length(); break; @@ -1730,7 +1727,7 @@ public class CharSequenceUtil extends StrValidator { final StringBuilder sb = new StringBuilder(); final int subLen = toIndex - fromIndex; - str.toString().codePoints().skip(fromIndex).limit(subLen).forEach(v -> sb.append(Character.toChars(v))); + str.codePoints().skip(fromIndex).limit(subLen).forEach(v -> sb.append(Character.toChars(v))); return sb.toString(); } @@ -4131,4 +4128,28 @@ public class CharSequenceUtil extends StrValidator { } return (isCodePoint ? str.codePoints() : str.chars()).toArray(); } + + /** + * 遍历字符串的每个字符,并处理 + * + * @param str 字符串 + * @param consumer 字符处理 + */ + public static void forEach(final CharSequence str, final Consumer consumer) { + forEach(str, false, (cInt)-> consumer.accept((char) cInt)); + } + + /** + * 遍历字符串的每个字符,并处理 + * + * @param str 字符串 + * @param isCodePoint 是否为Unicode码点(即支持emoji等多char字符) + * @param consumer 字符处理 + */ + public static void forEach(final CharSequence str, final boolean isCodePoint, final IntConsumer consumer) { + if (null == str) { + return; + } + (isCodePoint ? str.codePoints() : str.chars()).forEach(consumer); + } } diff --git a/hutool-core/src/main/java/org/dromara/hutool/core/text/finder/MultiStrFinder.java b/hutool-core/src/main/java/org/dromara/hutool/core/text/finder/MultiStrFinder.java index 9f2155881..1d3e3c4e4 100644 --- a/hutool-core/src/main/java/org/dromara/hutool/core/text/finder/MultiStrFinder.java +++ b/hutool-core/src/main/java/org/dromara/hutool/core/text/finder/MultiStrFinder.java @@ -1,51 +1,56 @@ package org.dromara.hutool.core.text.finder; +import org.dromara.hutool.core.text.StrUtil; + import java.util.*; /** * 多字符串查询器 底层思路 使用 AC 自动机实现 + * * @author newshiJ - * @date 2024/8/2 上午10:07 */ public class MultiStrFinder { - // 字符索引 - protected final Map charIndex = new HashMap<>(); + /** + * 创建多字符串查询器 + * @param source 字符串集合 + * @return 多字符串查询器 + */ + public static MultiStrFinder of(final Collection source) { + return new MultiStrFinder(source); + } + // 字符索引 + protected final Map charIndexMap = new HashMap<>(); // 全部字符数量 protected final int allCharSize; - // 根节点 protected final Node root; - // 全部节点数量 int nodeSize; /** * 构建多字符串查询器 - * @param source + * + * @param source 字符串集合 */ - public MultiStrFinder(Collection source){ + public MultiStrFinder(final Collection source) { // 待匹配的字符串 final Set stringSet = new HashSet<>(); // 所有字符 final Set charSet = new HashSet<>(); - for (String string : source) { + for (final String string : source) { stringSet.add(string); - char[] charArray = string.toCharArray(); - for (char c : charArray) { - charSet.add(c); - } + StrUtil.forEach(string, charSet::add); } allCharSize = charSet.size(); int index = 0; - for (Character c : charSet) { - charIndex.put(c,index); + for (final Character c : charSet) { + charIndexMap.put(c,index); index ++; } - - root = Node.createRoot(allCharSize); + this.root = Node.createRoot(index); buildPrefixTree(stringSet); buildFail(); @@ -53,19 +58,18 @@ public class MultiStrFinder { /** * 构建前缀树 + * * @param stringSst 待匹配的字符串 */ - protected void buildPrefixTree(Collection stringSst){ + protected void buildPrefixTree(final Collection stringSst) { // 节点编号 根节点已经是0了 所以从 1开始编号 int nodeIndex = 1; - for (String string : stringSst) { + for (final String string : stringSst) { Node node = root; - char[] charArray = string.toCharArray(); - for (int i = 0; i < charArray.length; i++) { - char c = charArray[i]; - boolean addValue = node.addValue(c, nodeIndex, charIndex); - if(addValue){ - nodeIndex ++; + for (final char c : string.toCharArray()) { + final boolean addValue = node.addValue(c, nodeIndex, charIndexMap); + if (addValue) { + nodeIndex++; } node = node.directRouter[getIndex(c)]; } @@ -78,11 +82,11 @@ public class MultiStrFinder { * 构建 fail指针过程 * 构建 directRouter 直接访问路由表 减少跳fail次数 直接跳 router 边 */ - protected void buildFail(){ - LinkedList nodeQueue = new LinkedList<>(); + protected void buildFail() { + final LinkedList nodeQueue = new LinkedList<>(); for (int i = 0; i < root.directRouter.length; i++) { - Node nextNode = root.directRouter[i]; - if(nextNode == null){ + final Node nextNode = root.directRouter[i]; + if (nextNode == null) { root.directRouter[i] = root; continue; } @@ -91,13 +95,13 @@ public class MultiStrFinder { } // 进行广度优先遍历 - while (!nodeQueue.isEmpty()){ - Node parent = nodeQueue.removeFirst(); + while (!nodeQueue.isEmpty()) { + final Node parent = nodeQueue.removeFirst(); // 因为 使用了 charIndex 进行字符到下标的映射 i 可以直接认为就是对应字符 char for (int i = 0; i < parent.directRouter.length; i++) { - Node child = parent.directRouter[i]; + final Node child = parent.directRouter[i]; // child 为 null 表示没有子节点 - if(child == null){ + if (child == null) { parent.directRouter[i] = parent.fail.directRouter[i]; continue; } @@ -110,27 +114,28 @@ public class MultiStrFinder { /** * 查询匹配的字符串 + * * @param text 返回每个匹配的 字符串 value是字符首字母地址 - * @return + * @return 匹配结果 */ - public Map> findMatch(String text){ + public Map> findMatch(final String text) { // 节点经过次数 放在方法内部声明变量 希望可以一个构建对象 进行多次匹配 - HashMap> resultMap = new HashMap<>(); + final HashMap> resultMap = new HashMap<>(); - char[] chars = text.toCharArray(); + final char[] chars = text.toCharArray(); Node currentNode = root; for (int i = 0; i < chars.length; i++) { - char c = chars[i]; - Integer index = charIndex.get(c); + final char c = chars[i]; + final Integer index = charIndexMap.get(c); // 找不到字符索引 认为一定不在匹配字符中存在 直接从根节点开始重新计算 - if(index == null){ + if (index == null) { currentNode = root; continue; } // 进入下一跳 可能是正常下一跳 也可能是fail加上后的 下一跳 currentNode = currentNode.directRouter[index]; // 判断是否尾部节点 是尾节点 说明已经匹配到了完整的字符串 将匹配结果写入返回对象 - if(currentNode.isEnd){ + if (currentNode.isEnd) { resultMap.computeIfAbsent(currentNode.tagetString, k -> new ArrayList<>()) .add(i - currentNode.tagetString.length() + 1); } @@ -143,22 +148,19 @@ public class MultiStrFinder { /** * 获取字符 下标 - * @param c - * @return + * + * @param c 字符 + * @return 下标 */ - protected int getIndex(char c){ - Integer i = charIndex.get(c); - if(i == null){ + protected int getIndex(final char c) { + final Integer i = charIndexMap.get(c); + if (i == null) { return -1; } return i; } - public static MultiStrFinder create(Collection source){ - return new MultiStrFinder(source); - } - /** * AC 自动机节点 */ @@ -189,19 +191,21 @@ public class MultiStrFinder { // fail指针来源 public List failPre = new ArrayList<>(); - public Node(){} + public Node() { + } /** * 新增子节点 - * @param c 字符 + * + * @param c 字符 * @param nodeIndex 节点编号 * @param charIndex 字符索引 * @return 如果已经存在子节点 false 新增 ture */ - public boolean addValue(char c, int nodeIndex ,Map charIndex){ - Integer index = charIndex.get(c); + public boolean addValue(final char c, final int nodeIndex, final Map charIndex) { + final Integer index = charIndex.get(c); Node node = directRouter[index]; - if(node != null){ + if (node != null) { return false; } node = new Node(); @@ -214,22 +218,24 @@ public class MultiStrFinder { /** * 标记当前节点为 字符串尾节点 + * * @param string */ - public void setEnd(String string){ + public void setEnd(final String string) { tagetString = string; isEnd = true; } /** * 获取下一跳 - * @param c 字符 + * + * @param c 字符 * @param charIndex 字符索引 * @return */ - public Node getNext(char c,Map charIndex){ - Integer index = charIndex.get(c); - if(index == null){ + public Node getNext(final char c, final Map charIndex) { + final Integer index = charIndex.get(c); + if (index == null) { return null; } return directRouter[index]; @@ -237,11 +243,12 @@ public class MultiStrFinder { /** * 构建根节点 + * * @param allCharSize 全部字符数量 * @return */ - public static Node createRoot(int allCharSize){ - Node node = new Node(); + public static Node createRoot(final int allCharSize) { + final Node node = new Node(); node.nodeIndex = 0; node.fail = node; node.directRouter = new Node[allCharSize]; diff --git a/hutool-core/src/main/java/org/dromara/hutool/core/text/replacer/HighMultiReplacerV2.java b/hutool-core/src/main/java/org/dromara/hutool/core/text/replacer/HighMultiReplacerV2.java index 3f79eae24..e3fb2156a 100644 --- a/hutool-core/src/main/java/org/dromara/hutool/core/text/replacer/HighMultiReplacerV2.java +++ b/hutool-core/src/main/java/org/dromara/hutool/core/text/replacer/HighMultiReplacerV2.java @@ -14,9 +14,9 @@ import java.util.*; * 3、"abc", "bc" 会优先替换"abc" * * @author newshiJ - * @date 2024/8/2 下午3:41 */ public class HighMultiReplacerV2 extends StrReplacer { + private static final long serialVersionUID = 1L; private final AhoCorasickAutomaton ahoCorasickAutomaton; @@ -49,19 +49,17 @@ public class HighMultiReplacerV2 extends StrReplacer { protected static class AhoCorasickAutomaton extends MultiStrFinder{ protected final Map replaceMap; - public AhoCorasickAutomaton(Map replaceMap){ + public AhoCorasickAutomaton(final Map replaceMap){ super(replaceMap.keySet()); this.replaceMap = replaceMap; } - - public void replace(final CharSequence text, final StringBuilder stringBuilder){ Node currentNode = root; // 临时字符串存储空间 - StringBuilder temp = new StringBuilder(); + final StringBuilder temp = new StringBuilder(); for (int i = 0; i < text.length(); i++) { - char ch = text.charAt(i); - Integer index = charIndex.get(ch); + final char ch = text.charAt(i); + final Integer index = charIndexMap.get(ch); // 下一个字符在候选转换字符串中都不存在 ch字符一定不会被替换 if(index < 0){ // 临时缓存空间中的数据写入到输出的 StringBuilder @@ -94,7 +92,7 @@ public class HighMultiReplacerV2 extends StrReplacer { // 表示匹配到 现在进行字符串替换工作 if(currentNode.isEnd){ - int length = currentNode.tagetString.length(); + final int length = currentNode.tagetString.length(); // 先清理匹配到的字符 最后一个字符未加入临时空间 temp.delete(temp.length() - length + 1,length - 1); if(temp.length() > 0){