From 60b1413aa6863d9c9c81de30fbbc03963241fe44 Mon Sep 17 00:00:00 2001 From: Looly Date: Fri, 10 Jan 2025 09:58:35 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0HtmlUtil.cleanEmptyTag?= =?UTF-8?q?=E6=96=B9=E6=B3=95=EF=BC=88pr#3838@Github=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../dromara/hutool/http/html/HtmlUtil.java | 15 +++ .../hutool/http/html/HtmlUtilTest.java | 104 +++++++++++------- 2 files changed, 79 insertions(+), 40 deletions(-) diff --git a/hutool-http/src/main/java/org/dromara/hutool/http/html/HtmlUtil.java b/hutool-http/src/main/java/org/dromara/hutool/http/html/HtmlUtil.java index 998bcd774..6cffc6e67 100644 --- a/hutool-http/src/main/java/org/dromara/hutool/http/html/HtmlUtil.java +++ b/hutool-http/src/main/java/org/dromara/hutool/http/html/HtmlUtil.java @@ -41,6 +41,10 @@ public class HtmlUtil { * HTML标签正则 */ public static final Pattern RE_HTML_MARK = Pattern.compile("(<[^<]*?>)|(<\\s*?/[^<]*?>)|(<[^<]*?/\\s*?>)", Pattern.CASE_INSENSITIVE); + /** + * 正则:匹配空标签 + */ + public static final String RE_HTML_EMPTY_MARK = "<(\\w+)([^>]*)>\\s*"; /** * script标签正则 */ @@ -111,6 +115,17 @@ public class HtmlUtil { return ReUtil.replaceAll(content, RE_HTML_MARK, StrUtil.EMPTY); } + /** + * 清除所有HTML空标签
+ * 例如:{@code

} + * + * @param content 文本 + * @return 清除空标签后的文本 + */ + public static String cleanEmptyTag(final String content) { + return content.replaceAll(RE_HTML_EMPTY_MARK, StrUtil.EMPTY); + } + /** * 清除所有script标签,包括内容 * diff --git a/hutool-http/src/test/java/org/dromara/hutool/http/html/HtmlUtilTest.java b/hutool-http/src/test/java/org/dromara/hutool/http/html/HtmlUtilTest.java index 569c37e77..c0978a56e 100644 --- a/hutool-http/src/test/java/org/dromara/hutool/http/html/HtmlUtilTest.java +++ b/hutool-http/src/test/java/org/dromara/hutool/http/html/HtmlUtilTest.java @@ -18,9 +18,10 @@ package org.dromara.hutool.http.html; import org.dromara.hutool.core.regex.ReUtil; import org.dromara.hutool.http.meta.ContentTypeUtil; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.assertEquals; + /** * Html单元测试 * @@ -34,32 +35,32 @@ public class HtmlUtilTest { //非闭合标签 String str = "pre"; String result = HtmlUtil.removeHtmlTag(str, "img"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //闭合标签 str = "pre"; result = HtmlUtil.removeHtmlTag(str, "img"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //闭合标签 str = "pre"; result = HtmlUtil.removeHtmlTag(str, "img"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //闭合标签 str = "pre"; result = HtmlUtil.removeHtmlTag(str, "img"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //包含内容标签 str = "pre
dfdsfdsfdsf
"; result = HtmlUtil.removeHtmlTag(str, "div"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //带换行 str = "pre
\r\n\t\tdfdsfdsfdsf\r\n
"; result = HtmlUtil.removeHtmlTag(str, "div"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); } @Test @@ -67,32 +68,32 @@ public class HtmlUtilTest { //非闭合标签 String str = "pre"; String result = HtmlUtil.cleanHtmlTag(str); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //闭合标签 str = "pre"; result = HtmlUtil.cleanHtmlTag(str); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //闭合标签 str = "pre"; result = HtmlUtil.cleanHtmlTag(str); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //闭合标签 str = "pre"; result = HtmlUtil.cleanHtmlTag(str); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //包含内容标签 str = "pre
dfdsfdsfdsf
"; result = HtmlUtil.cleanHtmlTag(str); - Assertions.assertEquals("predfdsfdsfdsf", result); + assertEquals("predfdsfdsfdsf", result); //带换行 str = "pre
\r\n\t\tdfdsfdsfdsf\r\n
BBBB
"; result = HtmlUtil.cleanHtmlTag(str); - Assertions.assertEquals("pre\r\n\t\tdfdsfdsfdsf\r\nBBBB", result); + assertEquals("pre\r\n\t\tdfdsfdsfdsf\r\nBBBB", result); } @Test @@ -100,37 +101,37 @@ public class HtmlUtilTest { //非闭合标签 String str = "pre"; String result = HtmlUtil.unwrapHtmlTag(str, "img"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //闭合标签 str = "pre"; result = HtmlUtil.unwrapHtmlTag(str, "img"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //闭合标签 str = "pre"; result = HtmlUtil.unwrapHtmlTag(str, "img"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //闭合标签 str = "pre"; result = HtmlUtil.unwrapHtmlTag(str, "img"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //闭合标签 str = "pre"; result = HtmlUtil.unwrapHtmlTag(str, "img"); - Assertions.assertEquals("pre", result); + assertEquals("pre", result); //包含内容标签 str = "pre
abc
"; result = HtmlUtil.unwrapHtmlTag(str, "div"); - Assertions.assertEquals("preabc", result); + assertEquals("preabc", result); //带换行 str = "pre
\r\n\t\tabc\r\n
"; result = HtmlUtil.unwrapHtmlTag(str, "div"); - Assertions.assertEquals("pre\r\n\t\tabc\r\n", result); + assertEquals("pre\r\n\t\tabc\r\n", result); } @Test @@ -139,34 +140,34 @@ public class HtmlUtilTest { final String htmlString = "测试文本"; final String tagString = "i,br"; final String cleanTxt = HtmlUtil.removeHtmlTag(htmlString, false, tagString.split(",")); - Assertions.assertEquals("测试文本", cleanTxt); + assertEquals("测试文本", cleanTxt); } @Test public void escapeTest() { final String html = "123'123'"; final String escape = HtmlUtil.escape(html); - Assertions.assertEquals("<html><body>123'123'</body></html>", escape); + assertEquals("<html><body>123'123'</body></html>", escape); final String restoreEscaped = HtmlUtil.unescape(escape); - Assertions.assertEquals(html, restoreEscaped); - Assertions.assertEquals("'", HtmlUtil.unescape("'")); + assertEquals(html, restoreEscaped); + assertEquals("'", HtmlUtil.unescape("'")); } @Test public void escapeTest2() { final char c = ' '; // 不断开空格(non-breaking space,缩写nbsp。) - Assertions.assertEquals(c, 160); + assertEquals(c, 160); final String html = " "; final String escape = HtmlUtil.escape(html); - Assertions.assertEquals("<html><body> </body></html>", escape); - Assertions.assertEquals(" ", HtmlUtil.unescape(" ")); + assertEquals("<html><body> </body></html>", escape); + assertEquals(" ", HtmlUtil.unescape(" ")); } @Test public void filterTest() { final String html = ""; final String filter = HtmlUtil.filter(html); - Assertions.assertEquals("", filter); + assertEquals("", filter); } @Test @@ -175,54 +176,77 @@ public class HtmlUtilTest { // 去除的属性加双引号测试 String html = "
"; String result = HtmlUtil.removeHtmlAttr(html, "class"); - Assertions.assertEquals("
", result); + assertEquals("
", result); // 去除的属性后跟空格、加单引号、不加引号测试 html = "
"; result = HtmlUtil.removeHtmlAttr(html, "class"); - Assertions.assertEquals("
", result); + assertEquals("
", result); // 去除的属性位于标签末尾、其它属性前测试 html = "
"; result = HtmlUtil.removeHtmlAttr(html, "class"); - Assertions.assertEquals("
", result); + assertEquals("
", result); // 去除的属性名和值之间存在空格 html = "
"; result = HtmlUtil.removeHtmlAttr(html, "class"); - Assertions.assertEquals("
", result); + assertEquals("
", result); } @Test public void removeAllHtmlAttrTest() { final String html = "
"; final String result = HtmlUtil.removeAllHtmlAttr(html, "div"); - Assertions.assertEquals("
", result); + assertEquals("
", result); } @Test public void getCharsetTest() { String charsetName = ReUtil.get(ContentTypeUtil.CHARSET_PATTERN, "Charset=UTF-8;fq=0.9", 1); - Assertions.assertEquals("UTF-8", charsetName); + assertEquals("UTF-8", charsetName); charsetName = ReUtil.get(HtmlUtil.META_CHARSET_PATTERN, "hello world"; String cleanText = HtmlUtil.removeHtmlAttr(html,"class"); - Assertions.assertEquals("
hello world
", cleanText); + assertEquals("
hello world
", cleanText); html = "
hello world
"; cleanText = HtmlUtil.removeHtmlAttr(html,"class"); - Assertions.assertEquals("
hello world
", cleanText); + assertEquals("
hello world
", cleanText); + } + + @Test + public void cleanEmptyTagTest() { + String str = "

"; + String result = HtmlUtil.cleanEmptyTag(str); + assertEquals("", result); + + str = "

TEXT

"; + result = HtmlUtil.cleanEmptyTag(str); + assertEquals("

TEXT

", result); + + str = "

TEXT
"; + result = HtmlUtil.cleanEmptyTag(str); + assertEquals("
TEXT
", result); + + str = "

TEXT

TEXT
"; + result = HtmlUtil.cleanEmptyTag(str); + assertEquals("

TEXT

TEXT
", result); + + str = "TEXT

TEXT"; + result = HtmlUtil.cleanEmptyTag(str); + assertEquals("TEXTTEXT", result); } }