add ByteOrderMark

This commit is contained in:
Looly 2023-03-29 13:42:26 +08:00
parent 41cb8a6db7
commit 27e1f5f61e
6 changed files with 225 additions and 19 deletions

View File

@ -0,0 +1,189 @@
package cn.hutool.core.io;
import cn.hutool.core.lang.Assert;
import cn.hutool.core.util.ArrayUtil;
import cn.hutool.core.util.CharsetUtil;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Locale;
import java.util.function.Predicate;
/**
 * Description of a Byte Order Mark (BOM) header.<br>
 * BOM definition: <a href="http://www.unicode.org/unicode/faq/utf_bom.html">http://www.unicode.org/unicode/faq/utf_bom.html</a>
 * <ul>
 *     <li>EF BB BF = UTF-8</li>
 *     <li>FE FF = UTF-16BE, big-endian</li>
 *     <li>FF FE = UTF-16LE, little-endian</li>
 *     <li>00 00 FE FF = UTF-32BE, big-endian</li>
 *     <li>FF FE 00 00 = UTF-32LE, little-endian</li>
 * </ul>
 *
 * <p>Taken from Apache-commons-io.</p>
 *
 * <p>Note: {@link #compareTo(ByteOrderMark)} orders by length only, while
 * {@link #equals(Object)} compares the byte values, so the natural ordering
 * is not consistent with equals.</p>
 *
 * @author Apache-commons-io
 */
public class ByteOrderMark implements Predicate<byte[]>, Comparable<ByteOrderMark>, Serializable {
	private static final long serialVersionUID = 1L;

	// region ----- BOMs
	/**
	 * UTF-8 BOM.
	 */
	public static final ByteOrderMark UTF_8 = new ByteOrderMark(CharsetUtil.NAME_UTF_8, 0xEF, 0xBB, 0xBF);

	/**
	 * UTF-16BE BOM (Big-Endian).
	 */
	public static final ByteOrderMark UTF_16BE = new ByteOrderMark("UTF-16BE", 0xFE, 0xFF);

	/**
	 * UTF-16LE BOM (Little-Endian).
	 */
	public static final ByteOrderMark UTF_16LE = new ByteOrderMark("UTF-16LE", 0xFF, 0xFE);

	/**
	 * UTF-32BE BOM (Big-Endian).
	 */
	public static final ByteOrderMark UTF_32BE = new ByteOrderMark("UTF-32BE", 0x00, 0x00, 0xFE, 0xFF);

	/**
	 * UTF-32LE BOM (Little-Endian).
	 */
	public static final ByteOrderMark UTF_32LE = new ByteOrderMark("UTF-32LE", 0xFF, 0xFE, 0x00, 0x00);

	/**
	 * All predefined BOMs.<br>
	 * The 4-byte marks are listed before the shorter ones: the UTF-32LE mark
	 * (FF FE 00 00) starts with the UTF-16LE mark (FF FE), so a first-match scan
	 * over this array must see the longer mark first.
	 */
	public static final ByteOrderMark[] ALL = new ByteOrderMark[]{
			UTF_32BE,
			UTF_32LE,
			UTF_8,
			UTF_16BE,
			UTF_16LE
	};
	// endregion

	private final String charsetName;
	private final int[] bytes;

	/**
	 * Constructor.
	 *
	 * @param charsetName name of the charset this BOM identifies
	 * @param bytes       the BOM bytes, one value per byte
	 * @throws IllegalArgumentException if the charset name is empty or no bytes are given
	 */
	public ByteOrderMark(final String charsetName, final int... bytes) {
		if (ArrayUtil.isEmpty(bytes)) {
			throw new IllegalArgumentException("No bytes specified");
		}
		this.charsetName = Assert.notEmpty(charsetName, "No charsetName specified");
		// Defensive copy so later changes to the caller's array cannot alter this BOM.
		this.bytes = bytes.clone();
	}

	/**
	 * Gets the name of the charset this BOM identifies.
	 *
	 * @return the charset name
	 */
	public String getCharsetName() {
		return charsetName;
	}

	/**
	 * Gets the number of bytes in this BOM.
	 *
	 * @return the BOM length in bytes
	 */
	public int length() {
		return bytes.length;
	}

	/**
	 * Gets the byte value at the given position, as it was passed to the constructor.
	 *
	 * @param pos The position
	 * @return The specified byte
	 */
	public int get(final int pos) {
		return bytes[pos];
	}

	/**
	 * Gets a copy of the BOM's bytes.
	 *
	 * @return a copy of the BOM's bytes
	 */
	public byte[] getBytes() {
		final byte[] copy = new byte[bytes.length];
		for (int i = 0; i < bytes.length; i++) {
			copy[i] = (byte) bytes[i];
		}
		return copy;
	}

	/**
	 * Tests whether the given head bytes start with this BOM.<br>
	 * Returns {@code false} when {@code headBytes} is {@code null} or shorter than this BOM.
	 *
	 * @param headBytes the leading bytes of the data to check
	 * @return {@code true} if the head bytes start with this BOM
	 */
	@Override
	public boolean test(final byte[] headBytes) {
		if (headBytes == null || headBytes.length < bytes.length) {
			return false;
		}
		for (int i = 0; i < bytes.length; i++) {
			// The BOM is stored as int values (e.g. 0xEF == 239) whereas a Java byte is
			// signed (0xEF == -17). Narrow the stored value to byte before comparing,
			// otherwise every value >= 0x80 would never match.
			if ((byte) bytes[i] != headBytes[i]) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Two BOMs are equal when their byte values are equal; the charset name is not compared.
	 *
	 * @param obj the object to compare with
	 * @return {@code true} if {@code obj} is a {@code ByteOrderMark} with the same bytes
	 */
	@Override
	public boolean equals(final Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof ByteOrderMark)) {
			return false;
		}
		final ByteOrderMark bom = (ByteOrderMark) obj;
		return Arrays.equals(this.bytes, bom.bytes);
	}

	/**
	 * Hash code derived from the byte values only, so that it agrees with
	 * {@link #equals(Object)} for every pair of equal instances (including subclasses).
	 *
	 * @return the hash code
	 */
	@Override
	public int hashCode() {
		return Arrays.hashCode(bytes);
	}

	/**
	 * Renders this BOM as {@code SimpleClassName[charsetName: 0xNN,0xNN,...]} with upper-case hex digits.
	 *
	 * @return the string form of this BOM
	 */
	@Override
	public String toString() {
		final StringBuilder builder = new StringBuilder();
		builder.append(getClass().getSimpleName());
		builder.append('[');
		builder.append(charsetName);
		builder.append(": ");
		for (int i = 0; i < bytes.length; i++) {
			if (i > 0) {
				builder.append(",");
			}
			builder.append("0x");
			builder.append(Integer.toHexString(0xFF & bytes[i]).toUpperCase(Locale.ROOT));
		}
		builder.append(']');
		return builder.toString();
	}

	/**
	 * Orders BOMs by length in descending order, i.e. longer BOMs sort first.
	 *
	 * @param o the BOM to compare with
	 * @return a negative value if this BOM is longer, zero if both have the same length, a positive value otherwise
	 */
	@Override
	public int compareTo(final ByteOrderMark o) {
		// Descending by length
		return Integer.compare(o.length(), this.length());
	}
}

View File

@ -13,6 +13,7 @@
package cn.hutool.core.io.file;
import cn.hutool.core.util.ArrayUtil;
import cn.hutool.core.util.CharsetUtil;
import java.math.BigInteger;
import java.util.Arrays;
@ -82,8 +83,7 @@ public enum FileMagicNumber {
try {
final int dataLength = new BigInteger(1, Arrays.copyOfRange(bytes, i, i + 4)).intValue();
i += 4;
final byte[] bytes1 = Arrays.copyOfRange(bytes, i, i + 4);
final String chunkType = new String(bytes1);
final String chunkType = new String(bytes, i, 4, CharsetUtil.ISO_8859_1);
i += 4;
if (Objects.equals(chunkType, "IDAT") || Objects.equals(chunkType, "IEND")) {
return false;
@ -606,6 +606,7 @@ public enum FileMagicNumber {
//去除bom头并且跳过三个字节
if (bytes.length > 3 && Objects.equals(bytes[0], (byte) 0xEF)
&& Objects.equals(bytes[1], (byte) 0xBB) && Objects.equals(bytes[2], (byte) 0xBF)) {
// UTF8 Bom
bytes = Arrays.copyOfRange(bytes, 3, bytes.length);
}
return bytes.length > 3

View File

@ -12,6 +12,7 @@
package cn.hutool.core.io.stream;
import cn.hutool.core.io.ByteOrderMark;
import cn.hutool.core.io.IORuntimeException;
import cn.hutool.core.util.CharsetUtil;
@ -121,25 +122,17 @@ public class BOMInputStream extends InputStream {
final byte[] bom = new byte[BOM_SIZE];
final int n;
final int unread;
int unread = 0;
n = in.read(bom, 0, bom.length);
if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
charset = "UTF-32BE";
unread = n - 4;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
charset = "UTF-32LE";
unread = n - 4;
} else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
charset = "UTF-8";
unread = n - 3;
} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
charset = "UTF-16BE";
unread = n - 2;
} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
charset = "UTF-16LE";
unread = n - 2;
} else {
for (final ByteOrderMark byteOrderMark : ByteOrderMark.ALL) {
if(byteOrderMark.test(bom)){
charset = byteOrderMark.getCharsetName();
unread = n - byteOrderMark.length();
break;
}
}
if(0 == unread) {
// Unicode BOM mark not found, unread all bytes
charset = defaultCharset;
unread = n;

View File

@ -42,6 +42,7 @@ public class ObjUtil {
* <p>比较两个对象是否相等满足下述任意条件即返回{@code true}
* <ul>
* <li>若两对象皆为{@link BigDecimal}且满足{@code 0 == obj1.compareTo(obj2)}</li>
* <li>若两对象都为数组调用Arrays.equals完成判断</li>
* <li>{@code obj1 == null && obj2 == null}</li>
* <li>{@code obj1.equals(obj2)}</li>
* </ul>
@ -54,6 +55,8 @@ public class ObjUtil {
public static boolean equals(final Object obj1, final Object obj2) {
	// Case 1: both operands are BigDecimal -> delegate to NumberUtil.equals
	final boolean bothBigDecimal = obj1 instanceof BigDecimal && obj2 instanceof BigDecimal;
	if (bothBigDecimal) {
		return NumberUtil.equals((BigDecimal) obj1, (BigDecimal) obj2);
	}

	// Case 2: both operands are arrays -> delegate to ArrayUtil.equals
	if (ArrayUtil.isArray(obj1) && ArrayUtil.isArray(obj2)) {
		return ArrayUtil.equals(obj1, obj2);
	}

	// Fallback: null-safe Objects.equals
	return Objects.equals(obj1, obj2);
}

View File

@ -3387,4 +3387,9 @@ public class PrimitiveArrayUtil {
return true;
}
// endregion
// region rangeMatches
/**
 * Placeholder: the current implementation ignores its argument and always
 * returns {@code false}.
 *
 * @param bytes1 the byte array (currently unused)
 * @return always {@code false}
 */
public static boolean rangeMatches(final byte[] bytes1) {
	// NOTE(review): stub with no matching logic yet - confirm the intended contract before relying on it.
	return false;
}
}

View File

@ -719,4 +719,19 @@ public class ArrayUtilTest {
b = ArrayUtil.startWith((int[])null, null);
Assert.assertTrue(b);
}
@Test
public void equalsTest() {
	// Two distinct int[] instances with identical contents must compare equal via ObjUtil.equals.
	final int[] left = new int[]{1, 2, 3};
	final int[] right = new int[]{1, 2, 3};
	Assert.assertTrue(ObjUtil.equals(left, right));
}
@Test
public void copyOfRangeTest() {
	// Decoding a copied sub-range must give the same text as decoding the same
	// range in place via the offset/length String constructor.
	final byte[] source = "aIDAT".getBytes(CharsetUtil.UTF_8);
	final byte[] copied = Arrays.copyOfRange(source, 1, 1 + 4);
	// Decode with the same charset used for encoding rather than the platform default,
	// so the test does not depend on the JVM's file.encoding setting.
	Assert.assertEquals(new String(copied, CharsetUtil.UTF_8),
			new String(source, 1, 4, CharsetUtil.UTF_8));
}
}