add ByteOrderMark

2025-05-09 23:51:34 +08:00 · 2023-03-29 13:42:26 +08:00 · 2023-03-29 13:42:26 +08:00 · 27e1f5f61e
commit 27e1f5f61e
parent 41cb8a6db7
6 changed files with 225 additions and 19 deletions
--- a/hutool-core/src/main/java/cn/hutool/core/io/ByteOrderMark.java
+++ b/hutool-core/src/main/java/cn/hutool/core/io/ByteOrderMark.java
@ -0,0 +1,189 @@
+package cn.hutool.core.io;
+
+import cn.hutool.core.lang.Assert;
+import cn.hutool.core.util.ArrayUtil;
+import cn.hutool.core.util.CharsetUtil;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Locale;
+import java.util.function.Predicate;
+
+/**
+ * Description of a Byte Order Mark (BOM) header.<br>
+ * BOM definition: <a href="http://www.unicode.org/unicode/faq/utf_bom.html">http://www.unicode.org/unicode/faq/utf_bom.html</a>
+ * <ul>
+ * 	<li>EF BB BF = UTF-8</li>
+ * 	<li>FE FF = UTF-16BE, big-endian</li>
+ * 	<li>FF FE = UTF-16LE, little-endian</li>
+ * 	<li>00 00 FE FF = UTF-32BE, big-endian</li>
+ * 	<li>FF FE 00 00 = UTF-32LE, little-endian</li>
+ * </ul>
+ *
+ * <p>Adapted from Apache commons-io.</p>
+ *
+ * <p>Note: {@link #equals(Object)} compares only the BOM bytes, whereas
+ * {@link #compareTo(ByteOrderMark)} orders by length (longest first), so the natural ordering
+ * is not consistent with equals.</p>
+ *
+ * @author Apache-commons-io
+ */
+public class ByteOrderMark implements Predicate<byte[]>, Comparable<ByteOrderMark>, Serializable {
+	private static final long serialVersionUID = 1L;
+
+	// region ----- BOMs
+	/**
+	 * UTF-8 BOM.
+	 */
+	public static final ByteOrderMark UTF_8 = new ByteOrderMark(CharsetUtil.NAME_UTF_8, 0xEF, 0xBB, 0xBF);
+
+	/**
+	 * UTF-16BE BOM (Big-Endian).
+	 */
+	public static final ByteOrderMark UTF_16BE = new ByteOrderMark("UTF-16BE", 0xFE, 0xFF);
+
+	/**
+	 * UTF-16LE BOM (Little-Endian).
+	 */
+	public static final ByteOrderMark UTF_16LE = new ByteOrderMark("UTF-16LE", 0xFF, 0xFE);
+
+	/**
+	 * UTF-32BE BOM (Big-Endian).
+	 */
+	public static final ByteOrderMark UTF_32BE = new ByteOrderMark("UTF-32BE", 0x00, 0x00, 0xFE, 0xFF);
+
+	/**
+	 * UTF-32LE BOM (Little-Endian).
+	 */
+	public static final ByteOrderMark UTF_32LE = new ByteOrderMark("UTF-32LE", 0xFF, 0xFE, 0x00, 0x00);
+
+	/**
+	 * All predefined BOMs.<br>
+	 * The 4-byte marks are listed first: the UTF-32LE mark (FF FE 00 00) begins with the
+	 * UTF-16LE mark (FF FE), so a first-match scan over this array must see the longer mark first.
+	 */
+	public static final ByteOrderMark[] ALL = new ByteOrderMark[]{
+			UTF_32BE,
+			UTF_32LE,
+			UTF_8,
+			UTF_16BE,
+			UTF_16LE
+	};
+	// endregion
+
+	/** Charset name this BOM stands for. */
+	private final String charsetName;
+	/** BOM bytes, kept as the int values handed to the constructor (e.g. 0xEF is stored as 239). */
+	private final int[] bytes;
+
+	/**
+	 * Creates a BOM description.
+	 *
+	 * @param charsetName name of the charset the BOM stands for
+	 * @param bytes       the BOM bytes
+	 * @throws IllegalArgumentException if the charset name is empty or no bytes are given
+	 */
+	public ByteOrderMark(final String charsetName, final int... bytes) {
+		if (ArrayUtil.isEmpty(bytes)) {
+			throw new IllegalArgumentException("No bytes specified");
+		}
+		this.charsetName = Assert.notEmpty(charsetName, "No charsetName specified");
+		// Defensive copy so later changes to the caller's array cannot alter this instance.
+		this.bytes = bytes.clone();
+	}
+
+	/**
+	 * Gets the name of the charset this BOM stands for.
+	 *
+	 * @return the charset name
+	 */
+	public String getCharsetName() {
+		return charsetName;
+	}
+
+	/**
+	 * Gets the number of bytes in this BOM.
+	 *
+	 * @return the BOM length in bytes
+	 */
+	public int length() {
+		return bytes.length;
+	}
+
+	/**
+	 * Gets the BOM value at the given position, as the int that was handed to the constructor.
+	 *
+	 * @param pos the position
+	 * @return the value at that position
+	 * @throws ArrayIndexOutOfBoundsException if {@code pos} is outside {@code [0, length())}
+	 */
+	public int get(final int pos) {
+		return bytes[pos];
+	}
+
+	/**
+	 * Gets a copy of the BOM's bytes.
+	 *
+	 * @return a new array holding the BOM values narrowed to {@code byte}
+	 */
+	public byte[] getBytes() {
+		final byte[] copy = new byte[bytes.length];
+		for (int i = 0; i < bytes.length; i++) {
+			copy[i] = (byte) bytes[i];
+		}
+		return copy;
+	}
+
+	/**
+	 * Tests whether the given head bytes start with this BOM.<br>
+	 * Returns {@code false} when {@code headBytes} is {@code null} or shorter than this BOM.
+	 *
+	 * @param headBytes the leading bytes of the data to inspect
+	 * @return {@code true} if the first {@link #length()} bytes equal this BOM
+	 */
+	@Override
+	public boolean test(final byte[] headBytes) {
+		if (headBytes == null || headBytes.length < bytes.length) {
+			return false;
+		}
+		for (int i = 0; i < bytes.length; i++) {
+			// The BOM is stored as ints (0xEF == 239) but Java bytes are signed (0xEF == -17).
+			// Narrow to byte before comparing, otherwise any value >= 0x80 can never match.
+			if ((byte) bytes[i] != headBytes[i]) {
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/**
+	 * Two BOMs are equal when their byte sequences are equal; the charset name is not compared.
+	 *
+	 * @param obj the object to compare with
+	 * @return {@code true} if {@code obj} is a {@code ByteOrderMark} with the same bytes
+	 */
+	@Override
+	public boolean equals(final Object obj) {
+		if (!(obj instanceof ByteOrderMark)) {
+			return false;
+		}
+		final ByteOrderMark bom = (ByteOrderMark) obj;
+		return Arrays.equals(this.bytes, bom.bytes);
+	}
+
+	/**
+	 * Hash code derived from the class and the sum of the BOM values, consistent with
+	 * {@link #equals(Object)}.
+	 *
+	 * @return the hash code
+	 */
+	@Override
+	public int hashCode() {
+		int hashCode = getClass().hashCode();
+		for (final int b : bytes) {
+			hashCode += b;
+		}
+		return hashCode;
+	}
+
+	/**
+	 * Renders the BOM as {@code SimpleClassName[charsetName: 0xNN,0xNN,...]} with upper-case hex.
+	 *
+	 * @return the string form of this BOM
+	 */
+	@Override
+	public String toString() {
+		final StringBuilder builder = new StringBuilder();
+		builder.append(getClass().getSimpleName());
+		builder.append('[');
+		builder.append(charsetName);
+		builder.append(": ");
+		for (int i = 0; i < bytes.length; i++) {
+			if (i > 0) {
+				builder.append(",");
+			}
+			builder.append("0x");
+			builder.append(Integer.toHexString(0xFF & bytes[i]).toUpperCase(Locale.ROOT));
+		}
+		builder.append(']');
+		return builder.toString();
+	}
+
+	/**
+	 * Orders BOMs by length, longest first. BOMs of equal length compare as 0 even when their
+	 * bytes differ, so this ordering is not consistent with {@link #equals(Object)}.
+	 *
+	 * @param o the BOM to compare with
+	 * @return a negative value if this BOM is longer, a positive value if it is shorter, else 0
+	 */
+	@Override
+	public int compareTo(final ByteOrderMark o) {
+		// Descending by length.
+		return Integer.compare(o.length(), this.length());
+	}
+}
--- a/hutool-core/src/main/java/cn/hutool/core/io/file/FileMagicNumber.java
+++ b/hutool-core/src/main/java/cn/hutool/core/io/file/FileMagicNumber.java
@ -13,6 +13,7 @@
 package cn.hutool.core.io.file;

 import cn.hutool.core.util.ArrayUtil;
+import cn.hutool.core.util.CharsetUtil;

 import java.math.BigInteger;
 import java.util.Arrays;
@ -82,8 +83,7 @@ public enum FileMagicNumber {
 					try {
 						final int dataLength = new BigInteger(1, Arrays.copyOfRange(bytes, i, i + 4)).intValue();
 						i += 4;
-						final byte[] bytes1 = Arrays.copyOfRange(bytes, i, i + 4);
-						final String chunkType = new String(bytes1);
+						final String chunkType = new String(bytes, i, 4, CharsetUtil.ISO_8859_1);
 						i += 4;
 						if (Objects.equals(chunkType, "IDAT") || Objects.equals(chunkType, "IEND")) {
 							return false;
@ -606,6 +606,7 @@ public enum FileMagicNumber {
 			//去除bom头并且跳过三个字节
 			if (bytes.length > 3 && Objects.equals(bytes[0], (byte) 0xEF)
 					&& Objects.equals(bytes[1], (byte) 0xBB) && Objects.equals(bytes[2], (byte) 0xBF)) {
+				// UTF8 Bom
 				bytes = Arrays.copyOfRange(bytes, 3, bytes.length);
 			}
 			return bytes.length > 3
--- a/hutool-core/src/main/java/cn/hutool/core/io/stream/BOMInputStream.java
+++ b/hutool-core/src/main/java/cn/hutool/core/io/stream/BOMInputStream.java
@ -12,6 +12,7 @@

 package cn.hutool.core.io.stream;

+import cn.hutool.core.io.ByteOrderMark;
 import cn.hutool.core.io.IORuntimeException;
 import cn.hutool.core.util.CharsetUtil;

@ -121,25 +122,17 @@ public class BOMInputStream extends InputStream {

 		final byte[] bom = new byte[BOM_SIZE];
 		final int n;
-		final int unread;
+		int unread = 0;
 		n = in.read(bom, 0, bom.length);

-		if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
-			charset = "UTF-32BE";
-			unread = n - 4;
-		} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
-			charset = "UTF-32LE";
-			unread = n - 4;
-		} else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
-			charset = "UTF-8";
-			unread = n - 3;
-		} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
-			charset = "UTF-16BE";
-			unread = n - 2;
-		} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
-			charset = "UTF-16LE";
-			unread = n - 2;
-		} else {
+		for (final ByteOrderMark byteOrderMark : ByteOrderMark.ALL) {
+			if(byteOrderMark.test(bom)){
+				charset = byteOrderMark.getCharsetName();
+				unread = n - byteOrderMark.length();
+				break;
+			}
+		}
+		 if(0 == unread) {
 			// Unicode BOM mark not found, unread all bytes
 			charset = defaultCharset;
 			unread = n;
--- a/hutool-core/src/main/java/cn/hutool/core/util/ObjUtil.java
+++ b/hutool-core/src/main/java/cn/hutool/core/util/ObjUtil.java
@ -42,6 +42,7 @@ public class ObjUtil {
 	 * <p>比较两个对象是否相等，满足下述任意条件即返回{@code true}：
 	 * <ul>
 	 *     <li>若两对象皆为{@link BigDecimal}，且满足{@code 0 == obj1.compareTo(obj2)}</li>
+	 *     <li>若两对象都为数组，调用Arrays.equals完成判断</li>
 	 *     <li>{@code obj1 == null && obj2 == null}</li>
 	 *     <li>{@code obj1.equals(obj2)}</li>
 	 * </ul>
@ -54,6 +55,8 @@ public class ObjUtil {
 	public static boolean equals(final Object obj1, final Object obj2) {
+		// Two BigDecimal operands are compared through NumberUtil.equals rather than BigDecimal.equals.
 		if (obj1 instanceof BigDecimal && obj2 instanceof BigDecimal) {
 			return NumberUtil.equals((BigDecimal) obj1, (BigDecimal) obj2);
+		} else if(ArrayUtil.isArray(obj1) && ArrayUtil.isArray(obj2)){
+			// Both operands are arrays: Object.equals on arrays is an identity check only,
+			// so the comparison is delegated to ArrayUtil.equals.
+			// NOTE(review): presumably a content comparison - confirm against ArrayUtil.
+			return ArrayUtil.equals(obj1, obj2);
 		}
+		// Everything else, including null operands, falls back to the null-safe Objects.equals.
 		return Objects.equals(obj1, obj2);
 	}
--- a/hutool-core/src/main/java/cn/hutool/core/util/PrimitiveArrayUtil.java
+++ b/hutool-core/src/main/java/cn/hutool/core/util/PrimitiveArrayUtil.java
@ -3387,4 +3387,9 @@ public class PrimitiveArrayUtil {
 		return true;
 	}
 	// endregion
+
+	// region rangeMatches
+	/**
+	 * Placeholder: the current body ignores its argument and always returns {@code false}.<br>
+	 * NOTE(review): the name suggests a byte-range comparison is planned, but no such logic
+	 * exists here yet - confirm the intended signature and contract before calling this.
+	 *
+	 * @param bytes1 a byte array; not read by the current implementation
+	 * @return always {@code false}
+	 */
+	public static boolean rangeMatches(final byte[] bytes1){
+		return false;
+	}
+	// endregion
 }
--- a/hutool-core/src/test/java/cn/hutool/core/util/ArrayUtilTest.java
+++ b/hutool-core/src/test/java/cn/hutool/core/util/ArrayUtilTest.java
@ -719,4 +719,19 @@ public class ArrayUtilTest {
 		b = ArrayUtil.startWith((int[])null, null);
 		Assert.assertTrue(b);
 	}
+
+	/**
+	 * Two distinct int arrays with the same content must be reported equal by ObjUtil.equals.
+	 */
+	@Test
+	public void equalsTest() {
+		final int[] left = {1, 2, 3};
+		final int[] right = {1, 2, 3};
+		Assert.assertTrue(ObjUtil.equals(left, right));
+	}
+
+	/**
+	 * Decoding a copied sub-range must give the same text as decoding the same range in place
+	 * with the offset/length String constructor.
+	 */
+	@Test
+	public void copyOfRangeTest() {
+		final String a = "aIDAT";
+		final byte[] source = a.getBytes(CharsetUtil.UTF_8);
+		final byte[] bytes1 = Arrays.copyOfRange(source, 1, 1 + 4);
+
+		// Decode with the charset used for encoding, so the test does not depend on the platform default.
+		final String copied = new String(bytes1, CharsetUtil.UTF_8);
+		Assert.assertEquals("IDAT", copied);
+		Assert.assertEquals(copied, new String(source, 1, 4, CharsetUtil.UTF_8));
+	}
 }