增加HtmlUtil.cleanEmptyTag方法(pr#3838@Github)

This commit is contained in:
Looly 2025-01-10 09:58:35 +08:00
parent b8ba60df50
commit 60b1413aa6
2 changed files with 79 additions and 40 deletions

View File

@ -41,6 +41,10 @@ public class HtmlUtil {
* HTML标签正则
*/
public static final Pattern RE_HTML_MARK = Pattern.compile("(<[^<]*?>)|(<\\s*?/[^<]*?>)|(<[^<]*?/\\s*?>)", Pattern.CASE_INSENSITIVE);
/**
 * Regular expression (as a string) matching an empty HTML element:
 * an opening tag whose name is captured as group 1 and whose remaining
 * content up to {@code >} (e.g. attributes) is captured as group 2,
 * followed by optional whitespace and a closing tag whose name must equal
 * group 1 (back-reference {@code \1}).
 */
public static final String RE_HTML_EMPTY_MARK = "<(\\w+)([^>]*)>\\s*</\\1>";
/**
* script标签正则
*/
@ -111,6 +115,17 @@ public class HtmlUtil {
return ReUtil.replaceAll(content, RE_HTML_MARK, StrUtil.EMPTY);
}
/**
 * Removes empty HTML elements from the given text.<br>
 * An element is considered empty when its opening tag is followed only by
 * optional whitespace and then by the closing tag of the same name,
 * for example {@code <p></p>}.
 * <p>
 * The replacement is a single pass over the text: for
 * {@code <div><p></p></div>} only the inner {@code <p></p>} is removed and
 * {@code <div></div>} is returned.
 *
 * @param content the text to clean, may be {@code null}
 * @return the text without empty elements, or {@code null} if {@code content} is {@code null}
 */
public static String cleanEmptyTag(final String content) {
	if (null == content) {
		// Nothing to clean; return null instead of failing with a NullPointerException
		return null;
	}
	return content.replaceAll(RE_HTML_EMPTY_MARK, StrUtil.EMPTY);
}
/**
* 清除所有script标签包括内容
*

View File

@ -18,9 +18,10 @@ package org.dromara.hutool.http.html;
import org.dromara.hutool.core.regex.ReUtil;
import org.dromara.hutool.http.meta.ContentTypeUtil;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
* Html单元测试
*
@ -34,32 +35,32 @@ public class HtmlUtilTest {
//非闭合标签
String str = "pre<img src=\"xxx/dfdsfds/test.jpg\">";
String result = HtmlUtil.removeHtmlTag(str, "img");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//闭合标签
str = "pre<img>";
result = HtmlUtil.removeHtmlTag(str, "img");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//闭合标签
str = "pre<img src=\"xxx/dfdsfds/test.jpg\" />";
result = HtmlUtil.removeHtmlTag(str, "img");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//闭合标签
str = "pre<img />";
result = HtmlUtil.removeHtmlTag(str, "img");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//包含内容标签
str = "pre<div class=\"test_div\">dfdsfdsfdsf</div>";
result = HtmlUtil.removeHtmlTag(str, "div");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//带换行
str = "pre<div class=\"test_div\">\r\n\t\tdfdsfdsfdsf\r\n</div>";
result = HtmlUtil.removeHtmlTag(str, "div");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
}
@Test
@ -67,32 +68,32 @@ public class HtmlUtilTest {
//非闭合标签
String str = "pre<img src=\"xxx/dfdsfds/test.jpg\">";
String result = HtmlUtil.cleanHtmlTag(str);
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//闭合标签
str = "pre<img>";
result = HtmlUtil.cleanHtmlTag(str);
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//闭合标签
str = "pre<img src=\"xxx/dfdsfds/test.jpg\" />";
result = HtmlUtil.cleanHtmlTag(str);
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//闭合标签
str = "pre<img />";
result = HtmlUtil.cleanHtmlTag(str);
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//包含内容标签
str = "pre<div class=\"test_div\">dfdsfdsfdsf</div>";
result = HtmlUtil.cleanHtmlTag(str);
Assertions.assertEquals("predfdsfdsfdsf", result);
assertEquals("predfdsfdsfdsf", result);
//带换行
str = "pre<div class=\"test_div\">\r\n\t\tdfdsfdsfdsf\r\n</div><div class=\"test_div\">BBBB</div>";
result = HtmlUtil.cleanHtmlTag(str);
Assertions.assertEquals("pre\r\n\t\tdfdsfdsfdsf\r\nBBBB", result);
assertEquals("pre\r\n\t\tdfdsfdsfdsf\r\nBBBB", result);
}
@Test
@ -100,37 +101,37 @@ public class HtmlUtilTest {
//非闭合标签
String str = "pre<img src=\"xxx/dfdsfds/test.jpg\">";
String result = HtmlUtil.unwrapHtmlTag(str, "img");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//闭合标签
str = "pre<img>";
result = HtmlUtil.unwrapHtmlTag(str, "img");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//闭合标签
str = "pre<img src=\"xxx/dfdsfds/test.jpg\" />";
result = HtmlUtil.unwrapHtmlTag(str, "img");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//闭合标签
str = "pre<img />";
result = HtmlUtil.unwrapHtmlTag(str, "img");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//闭合标签
str = "pre<img/>";
result = HtmlUtil.unwrapHtmlTag(str, "img");
Assertions.assertEquals("pre", result);
assertEquals("pre", result);
//包含内容标签
str = "pre<div class=\"test_div\">abc</div>";
result = HtmlUtil.unwrapHtmlTag(str, "div");
Assertions.assertEquals("preabc", result);
assertEquals("preabc", result);
//带换行
str = "pre<div class=\"test_div\">\r\n\t\tabc\r\n</div>";
result = HtmlUtil.unwrapHtmlTag(str, "div");
Assertions.assertEquals("pre\r\n\t\tabc\r\n", result);
assertEquals("pre\r\n\t\tabc\r\n", result);
}
@Test
@ -139,34 +140,34 @@ public class HtmlUtilTest {
final String htmlString = "<html><img src='aaa'><i>测试文本</i></html>";
final String tagString = "i,br";
final String cleanTxt = HtmlUtil.removeHtmlTag(htmlString, false, tagString.split(","));
Assertions.assertEquals("<html><img src='aaa'>测试文本</html>", cleanTxt);
assertEquals("<html><img src='aaa'>测试文本</html>", cleanTxt);
}
/**
 * Checks that {@code HtmlUtil.escape} and {@code HtmlUtil.unescape} round-trip
 * a small document containing single quotes.
 * NOTE(review): every check appears twice, once through {@code Assertions.assertEquals}
 * and once through the statically imported {@code assertEquals}; this looks like
 * interleaved before/after lines of a diff - confirm.
 */
@Test
public void escapeTest() {
final String html = "<html><body>123'123'</body></html>";
final String escape = HtmlUtil.escape(html);
// Angle brackets are expected as &lt; / &gt; and the single quote as &#039;
Assertions.assertEquals("&lt;html&gt;&lt;body&gt;123&#039;123&#039;&lt;/body&gt;&lt;/html&gt;", escape);
assertEquals("&lt;html&gt;&lt;body&gt;123&#039;123&#039;&lt;/body&gt;&lt;/html&gt;", escape);
final String restoreEscaped = HtmlUtil.unescape(escape);
// Unescaping is expected to restore the original input; &apos; is expected to decode to a single quote
Assertions.assertEquals(html, restoreEscaped);
Assertions.assertEquals("'", HtmlUtil.unescape("&apos;"));
assertEquals(html, restoreEscaped);
assertEquals("'", HtmlUtil.unescape("&apos;"));
}
/**
 * Checks escaping and unescaping of the non-breaking space character
 * (code point 160, HTML entity {@code &nbsp;}).
 * NOTE(review): every check appears twice, once through {@code Assertions.assertEquals}
 * and once through the statically imported {@code assertEquals}; this looks like
 * interleaved before/after lines of a diff - confirm.
 */
@Test
public void escapeTest2() {
final char c = ' '; // non-breaking space (abbreviated nbsp)
Assertions.assertEquals(c, 160);
assertEquals(c, 160);
final String html = "<html><body> </body></html>";
final String escape = HtmlUtil.escape(html);
// The character between the body tags is expected to be escaped as &nbsp;
Assertions.assertEquals("&lt;html&gt;&lt;body&gt;&nbsp;&lt;/body&gt;&lt;/html&gt;", escape);
Assertions.assertEquals(" ", HtmlUtil.unescape("&nbsp;"));
assertEquals("&lt;html&gt;&lt;body&gt;&nbsp;&lt;/body&gt;&lt;/html&gt;", escape);
assertEquals(" ", HtmlUtil.unescape("&nbsp;"));
}
/**
 * Checks that {@code HtmlUtil.filter} returns an empty string for the input
 * {@code <alert></alert>}.
 * NOTE(review): the check appears twice, once through {@code Assertions.assertEquals}
 * and once through the statically imported {@code assertEquals}; this looks like
 * interleaved before/after lines of a diff - confirm.
 */
@Test
public void filterTest() {
final String html = "<alert></alert>";
final String filter = HtmlUtil.filter(html);
Assertions.assertEquals("", filter);
assertEquals("", filter);
}
@Test
@ -175,54 +176,77 @@ public class HtmlUtilTest {
// 去除的属性加双引号测试
String html = "<div class=\"test_div\"></div><span class=\"test_div\"></span>";
String result = HtmlUtil.removeHtmlAttr(html, "class");
Assertions.assertEquals("<div></div><span></span>", result);
assertEquals("<div></div><span></span>", result);
// 去除的属性后跟空格加单引号不加引号测试
html = "<div class=test_div></div><span Class='test_div' ></span>";
result = HtmlUtil.removeHtmlAttr(html, "class");
Assertions.assertEquals("<div></div><span></span>", result);
assertEquals("<div></div><span></span>", result);
// 去除的属性位于标签末尾其它属性前测试
html = "<div style=\"margin:100%\" class=test_div></div><span Class='test_div' width=100></span>";
result = HtmlUtil.removeHtmlAttr(html, "class");
Assertions.assertEquals("<div style=\"margin:100%\"></div><span width=100></span>", result);
assertEquals("<div style=\"margin:100%\"></div><span width=100></span>", result);
// 去除的属性名和值之间存在空格
html = "<div style = \"margin:100%\" class = test_div></div><span Class = 'test_div' width=100></span>";
result = HtmlUtil.removeHtmlAttr(html, "class");
Assertions.assertEquals("<div style = \"margin:100%\"></div><span width=100></span>", result);
assertEquals("<div style = \"margin:100%\"></div><span width=100></span>", result);
}
/**
 * Checks that {@code HtmlUtil.removeAllHtmlAttr} strips every attribute
 * from the {@code div} tag of the sample input.
 * NOTE(review): the check appears twice, once through {@code Assertions.assertEquals}
 * and once through the statically imported {@code assertEquals}; this looks like
 * interleaved before/after lines of a diff - confirm.
 */
@Test
public void removeAllHtmlAttrTest() {
final String html = "<div class=\"test_div\" width=\"120\"></div>";
final String result = HtmlUtil.removeAllHtmlAttr(html, "div");
Assertions.assertEquals("<div></div>", result);
assertEquals("<div></div>", result);
}
/**
 * Checks that group 1 of {@code ContentTypeUtil.CHARSET_PATTERN} and
 * {@code HtmlUtil.META_CHARSET_PATTERN} yields the charset name for several
 * spellings of a charset declaration.
 * NOTE(review): every check appears twice, once through {@code Assertions.assertEquals}
 * and once through the statically imported {@code assertEquals}; this looks like
 * interleaved before/after lines of a diff - confirm.
 */
@Test
public void getCharsetTest() {
// Content-Type style declaration
String charsetName = ReUtil.get(ContentTypeUtil.CHARSET_PATTERN, "Charset=UTF-8;fq=0.9", 1);
Assertions.assertEquals("UTF-8", charsetName);
assertEquals("UTF-8", charsetName);
// meta tag, unquoted value
charsetName = ReUtil.get(HtmlUtil.META_CHARSET_PATTERN, "<meta charset=utf-8", 1);
Assertions.assertEquals("utf-8", charsetName);
assertEquals("utf-8", charsetName);
// meta tag, single-quoted value
charsetName = ReUtil.get(HtmlUtil.META_CHARSET_PATTERN, "<meta charset='utf-8'", 1);
Assertions.assertEquals("utf-8", charsetName);
assertEquals("utf-8", charsetName);
// meta tag, double-quoted value
charsetName = ReUtil.get(HtmlUtil.META_CHARSET_PATTERN, "<meta charset=\"utf-8\"", 1);
Assertions.assertEquals("utf-8", charsetName);
assertEquals("utf-8", charsetName);
// meta tag, spaces around the equals sign
charsetName = ReUtil.get(HtmlUtil.META_CHARSET_PATTERN, "<meta charset = \"utf-8\"", 1);
Assertions.assertEquals("utf-8", charsetName);
assertEquals("utf-8", charsetName);
}
/**
 * Checks that {@code HtmlUtil.removeHtmlAttr} removes a {@code class} attribute
 * both when its quoted value contains a space and when its value is unquoted.
 * NOTE(review): every check appears twice, once through {@code Assertions.assertEquals}
 * and once through the statically imported {@code assertEquals}; this looks like
 * interleaved before/after lines of a diff - confirm.
 */
@Test
void issueI6YNTFTest() {
// Quoted attribute value containing a space
String html = "<html><body><div class=\"a1 a2\">hello world</div></body></html>";
String cleanText = HtmlUtil.removeHtmlAttr(html,"class");
Assertions.assertEquals("<html><body><div>hello world</div></body></html>", cleanText);
assertEquals("<html><body><div>hello world</div></body></html>", cleanText);
// Unquoted attribute value
html = "<html><body><div class=a1>hello world</div></body></html>";
cleanText = HtmlUtil.removeHtmlAttr(html,"class");
Assertions.assertEquals("<html><body><div>hello world</div></body></html>", cleanText);
assertEquals("<html><body><div>hello world</div></body></html>", cleanText);
}
/**
 * Checks {@code HtmlUtil.cleanEmptyTag} against inputs that mix empty tags,
 * tags with text content, and surrounding plain text.
 */
@Test
public void cleanEmptyTagTest() {
	// Only empty tags: nothing is left
	assertEquals("", HtmlUtil.cleanEmptyTag("<p></p><div></div>"));

	// A tag with content followed by an empty tag
	assertEquals("<p>TEXT</p>", HtmlUtil.cleanEmptyTag("<p>TEXT</p><div></div>"));

	// An empty tag followed by a tag with content
	assertEquals("<div>TEXT</div>", HtmlUtil.cleanEmptyTag("<p></p><div>TEXT</div>"));

	// No empty tags: the input is returned unchanged
	assertEquals("<p>TEXT</p><div>TEXT</div>", HtmlUtil.cleanEmptyTag("<p>TEXT</p><div>TEXT</div>"));

	// Empty tags surrounded by plain text
	assertEquals("TEXTTEXT", HtmlUtil.cleanEmptyTag("TEXT<p></p><div></div>TEXT"));
}
}