From 74b4b68bb04c185b9cab770d5dafac83d326c00b Mon Sep 17 00:00:00 2001 From: Looly Date: Sun, 1 Aug 2021 20:54:28 +0800 Subject: [PATCH] change longest common text --- CHANGELOG.md | 1 + .../cn/hutool/core/text/TextSimilarity.java | 78 ++++++++++++++----- .../hutool/core/text/TextSimilarityTest.java | 20 ++++- 3 files changed, 74 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ec8658dc..a254d6416 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ * 【json 】 JSONGetter增加getLocalDateTime方法(pr#387@Gitee) * 【core 】 增加JNDIUtil(issue#1727@Github) * 【core 】 SpringUtil增加unregisterBean方法(pr#388@Gitee) +* 【core 】 优化TextSimilarity公共子串算法(issue#I42A6V@Gitee) ### 🐞Bug修复 * 【jwt 】 修复JWTUtil中几个方法非static的问题(issue#1735@Github) diff --git a/hutool-core/src/main/java/cn/hutool/core/text/TextSimilarity.java b/hutool-core/src/main/java/cn/hutool/core/text/TextSimilarity.java index e0cefe74c..33e1ecfa0 100644 --- a/hutool-core/src/main/java/cn/hutool/core/text/TextSimilarity.java +++ b/hutool-core/src/main/java/cn/hutool/core/text/TextSimilarity.java @@ -13,7 +13,12 @@ import cn.hutool.core.util.StrUtil; public class TextSimilarity { /** - * 计算相似度,两个都是空串相似度为1,被认为是相同的串 + * 计算相似度,两个都是空串相似度为1,被认为是相同的串
+ * 比较方法为: + * * * @param strA 字符串1 * @param strB 字符串2 @@ -36,8 +41,8 @@ public class TextSimilarity { return 1; } - int temp2 = longestCommonSubstring(newStrA, newStrB).length(); - return NumberUtil.div(temp2, temp); + final int commonLength = longestCommonSubstringLength(newStrA, newStrB); + return NumberUtil.div(commonLength, temp); } /** @@ -52,6 +57,40 @@ public class TextSimilarity { return NumberUtil.formatPercent(similar(strA, strB), scale); } + /** + * 最长公共子串,采用动态规划算法。 其不要求所求得的字符在所给的字符串中是连续的。
+ * 算法解析见:https://leetcode-cn.com/problems/longest-common-subsequence/solution/zui-chang-gong-gong-zi-xu-lie-by-leetcod-y7u0/ + * + * @param strA 字符串1 + * @param strB 字符串2 + * @return 最长公共子串 + */ + public static String longestCommonSubstring(String strA, String strB) { + // 初始化矩阵数据,matrix[0][0]的值为0, 如果字符数组chars_strA和chars_strB的对应位相同,则matrix[i][j]的值为左上角的值加1, + // 否则,matrix[i][j]的值等于左上方最近两个位置的较大值, 矩阵中其余各点的值为0. + final int[][] matrix = generateMatrix(strA, strB); + + int m = strA.length(); + int n = strB.length(); + // 矩阵中,如果matrix[m][n]的值不等于matrix[m-1][n]的值也不等于matrix[m][n-1]的值, + // 则matrix[m][n]对应的字符为相似字符元,并将其存入result数组中。 + char[] result = new char[matrix[m][n]]; + int currentIndex = result.length - 1; + while (matrix[m][n] != 0) { + if (matrix[m][n] == matrix[m][n - 1]) { + n--; + } else if (matrix[m][n] == matrix[m - 1][n]) { + m--; + } else { + result[currentIndex] = strA.charAt(m - 1); + currentIndex--; + n--; + m--; + } + } + return new String(result); + } + // --------------------------------------------------------------------------------------------------- Private method start /** * 将字符串的所有数据依次写成一行,去除无意义字符串 @@ -94,7 +133,20 @@ public class TextSimilarity { * @param strB 字符串2 * @return 公共子串 */ - private static String longestCommonSubstring(String strA, String strB) { + private static int longestCommonSubstringLength(String strA, String strB) { + final int m = strA.length(); + final int n = strB.length(); + return generateMatrix(strA, strB)[m][n]; + } + + /** + * 求公共子串,采用动态规划算法。 其不要求所求得的字符在所给的字符串中是连续的。 + * + * @param strA 字符串1 + * @param strB 字符串2 + * @return 公共串矩阵 + */ + private static int[][] generateMatrix(String strA, String strB) { int m = strA.length(); int n = strB.length(); @@ -111,23 +163,7 @@ public class TextSimilarity { } } - // 矩阵中,如果matrix[m][n]的值不等于matrix[m-1][n]的值也不等于matrix[m][n-1]的值, - // 则matrix[m][n]对应的字符为相似字符元,并将其存入result数组中。 - char[] result = new char[matrix[m][n]]; - int currentIndex = result.length - 1; - while (matrix[m][n] != 0) { - if (matrix[m][n] == matrix[m][n - 1]) { - n--; - } else if (matrix[m][n] == matrix[m - 1][n]) { - m--; - } else { - result[currentIndex] = strA.charAt(m - 1); - currentIndex--; - n--; - m--; - } - } - return new String(result); + return matrix; } // --------------------------------------------------------------------------------------------------- Private method end } diff --git a/hutool-core/src/test/java/cn/hutool/core/text/TextSimilarityTest.java b/hutool-core/src/test/java/cn/hutool/core/text/TextSimilarityTest.java index b7d7d433e..c3933ad20 100644 --- a/hutool-core/src/test/java/cn/hutool/core/text/TextSimilarityTest.java +++ b/hutool-core/src/test/java/cn/hutool/core/text/TextSimilarityTest.java @@ -9,15 +9,27 @@ import org.junit.Test; * */ public class TextSimilarityTest { - + @Test public void similarDegreeTest() { String a = "我是一个文本,独一无二的文本"; String b = "一个文本,独一无二的文本"; - + double degree = TextSimilarity.similar(a, b); Assert.assertEquals(0.8571428571428571D, degree, 16); - + + String similarPercent = TextSimilarity.similar(a, b, 2); + Assert.assertEquals("84.62%", similarPercent); + } + + @Test + public void similarDegreeTest2() { + String a = "我是一个文本,独一无二的文本"; + String b = "一个文本,独一无二的文本,#,>>?#$%^%$&^&^%"; + + double degree = TextSimilarity.similar(a, b); + Assert.assertEquals(0.8571428571428571D, degree, 16); + String similarPercent = TextSimilarity.similar(a, b, 2); Assert.assertEquals("84.62%", similarPercent); } @@ -27,4 +39,4 @@ public class TextSimilarityTest { final double abd = TextSimilarity.similar("abd", "1111"); Assert.assertEquals(0, abd, 1); } -} \ No newline at end of file +}