mirror of
https://gitee.com/chinabugotech/hutool.git
synced 2025-05-09 23:51:34 +08:00
change longest common text
This commit is contained in:
parent
0026ffff93
commit
74b4b68bb0
@ -12,6 +12,7 @@
|
||||
* 【json 】 JSONGetter增加getLocalDateTime方法(pr#387@Gitee)
|
||||
* 【core 】 增加JNDIUtil(issue#1727@Github)
|
||||
* 【core 】 SpringUtil增加unregisterBean方法(pr#388@Gitee)
|
||||
* 【core 】 优化TextSimilarity公共子串算法(issue#I42A6V@Gitee)
|
||||
|
||||
### 🐞Bug修复
|
||||
* 【jwt 】 修复JWTUtil中几个方法非static的问题(issue#1735@Github)
|
||||
|
@ -13,7 +13,12 @@ import cn.hutool.core.util.StrUtil;
|
||||
public class TextSimilarity {
|
||||
|
||||
/**
|
||||
* 计算相似度,两个都是空串相似度为1,被认为是相同的串
|
||||
* 计算相似度,两个都是空串相似度为1,被认为是相同的串<br>
|
||||
* 比较方法为:
|
||||
* <ul>
|
||||
* <li>只比较两个字符串字母、数字、汉字部分,其他符号去除</li>
|
||||
* <li>计算出两个字符串最大子串,除以最长的字符串,结果即为相似度</li>
|
||||
* </ul>
|
||||
*
|
||||
* @param strA 字符串1
|
||||
* @param strB 字符串2
|
||||
@ -36,8 +41,8 @@ public class TextSimilarity {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int temp2 = longestCommonSubstring(newStrA, newStrB).length();
|
||||
return NumberUtil.div(temp2, temp);
|
||||
final int commonLength = longestCommonSubstringLength(newStrA, newStrB);
|
||||
return NumberUtil.div(commonLength, temp);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -52,6 +57,40 @@ public class TextSimilarity {
|
||||
return NumberUtil.formatPercent(similar(strA, strB), scale);
|
||||
}
|
||||
|
||||
/**
|
||||
* 最长公共子串,采用动态规划算法。 其不要求所求得的字符在所给的字符串中是连续的。<br>
|
||||
* 算法解析见:https://leetcode-cn.com/problems/longest-common-subsequence/solution/zui-chang-gong-gong-zi-xu-lie-by-leetcod-y7u0/
|
||||
*
|
||||
* @param strA 字符串1
|
||||
* @param strB 字符串2
|
||||
* @return 最长公共子串
|
||||
*/
|
||||
public static String longestCommonSubstring(String strA, String strB) {
|
||||
// 初始化矩阵数据,matrix[0][0]的值为0, 如果字符数组chars_strA和chars_strB的对应位相同,则matrix[i][j]的值为左上角的值加1,
|
||||
// 否则,matrix[i][j]的值等于左上方最近两个位置的较大值, 矩阵中其余各点的值为0.
|
||||
final int[][] matrix = generateMatrix(strA, strB);
|
||||
|
||||
int m = strA.length();
|
||||
int n = strB.length();
|
||||
// 矩阵中,如果matrix[m][n]的值不等于matrix[m-1][n]的值也不等于matrix[m][n-1]的值,
|
||||
// 则matrix[m][n]对应的字符为相似字符元,并将其存入result数组中。
|
||||
char[] result = new char[matrix[m][n]];
|
||||
int currentIndex = result.length - 1;
|
||||
while (matrix[m][n] != 0) {
|
||||
if (matrix[m][n] == matrix[m][n - 1]) {
|
||||
n--;
|
||||
} else if (matrix[m][n] == matrix[m - 1][n]) {
|
||||
m--;
|
||||
} else {
|
||||
result[currentIndex] = strA.charAt(m - 1);
|
||||
currentIndex--;
|
||||
n--;
|
||||
m--;
|
||||
}
|
||||
}
|
||||
return new String(result);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------- Private method start
|
||||
/**
|
||||
* 将字符串的所有数据依次写成一行,去除无意义字符串
|
||||
@ -94,7 +133,20 @@ public class TextSimilarity {
|
||||
* @param strB 字符串2
|
||||
* @return 公共子串
|
||||
*/
|
||||
private static String longestCommonSubstring(String strA, String strB) {
|
||||
private static int longestCommonSubstringLength(String strA, String strB) {
|
||||
final int m = strA.length();
|
||||
final int n = strB.length();
|
||||
return generateMatrix(strA, strB)[m][n];
|
||||
}
|
||||
|
||||
/**
|
||||
* 求公共子串,采用动态规划算法。 其不要求所求得的字符在所给的字符串中是连续的。
|
||||
*
|
||||
* @param strA 字符串1
|
||||
* @param strB 字符串2
|
||||
* @return 公共串矩阵
|
||||
*/
|
||||
private static int[][] generateMatrix(String strA, String strB) {
|
||||
int m = strA.length();
|
||||
int n = strB.length();
|
||||
|
||||
@ -111,23 +163,7 @@ public class TextSimilarity {
|
||||
}
|
||||
}
|
||||
|
||||
// 矩阵中,如果matrix[m][n]的值不等于matrix[m-1][n]的值也不等于matrix[m][n-1]的值,
|
||||
// 则matrix[m][n]对应的字符为相似字符元,并将其存入result数组中。
|
||||
char[] result = new char[matrix[m][n]];
|
||||
int currentIndex = result.length - 1;
|
||||
while (matrix[m][n] != 0) {
|
||||
if (matrix[m][n] == matrix[m][n - 1]) {
|
||||
n--;
|
||||
} else if (matrix[m][n] == matrix[m - 1][n]) {
|
||||
m--;
|
||||
} else {
|
||||
result[currentIndex] = strA.charAt(m - 1);
|
||||
currentIndex--;
|
||||
n--;
|
||||
m--;
|
||||
}
|
||||
}
|
||||
return new String(result);
|
||||
return matrix;
|
||||
}
|
||||
// --------------------------------------------------------------------------------------------------- Private method end
|
||||
}
|
||||
|
@ -9,15 +9,27 @@ import org.junit.Test;
|
||||
*
|
||||
*/
|
||||
public class TextSimilarityTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void similarDegreeTest() {
|
||||
String a = "我是一个文本,独一无二的文本";
|
||||
String b = "一个文本,独一无二的文本";
|
||||
|
||||
|
||||
double degree = TextSimilarity.similar(a, b);
|
||||
Assert.assertEquals(0.8571428571428571D, degree, 16);
|
||||
|
||||
|
||||
String similarPercent = TextSimilarity.similar(a, b, 2);
|
||||
Assert.assertEquals("84.62%", similarPercent);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void similarDegreeTest2() {
|
||||
String a = "我是一个文本,独一无二的文本";
|
||||
String b = "一个文本,独一无二的文本,#,>>?#$%^%$&^&^%";
|
||||
|
||||
double degree = TextSimilarity.similar(a, b);
|
||||
Assert.assertEquals(0.8571428571428571D, degree, 16);
|
||||
|
||||
String similarPercent = TextSimilarity.similar(a, b, 2);
|
||||
Assert.assertEquals("84.62%", similarPercent);
|
||||
}
|
||||
@ -27,4 +39,4 @@ public class TextSimilarityTest {
|
||||
final double abd = TextSimilarity.similar("abd", "1111");
|
||||
Assert.assertEquals(0, abd, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user