// Listing metadata carried over from the original page: C#, 104 lines, 3.1 KiB.
using System;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Helpers for decoding <c>\uXXXX</c> escape sequences, detecting and removing the
/// UTF-8 byte-order mark (BOM), and guessing the text encoding of a byte array.
/// </summary>
public class UnicodeUtil
{
    // Backslash, 'u' (either case because of IgnoreCase) and exactly four hex digits.
    // Cached in a static field: the pattern never changes, so it is parsed only once.
    private static readonly Regex UnicodeEscapeRegex = new Regex(
        "\\\\u[0123456789abcdef]{4}",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    // UTF-8 BOM byte sequence.
    private static readonly byte[] Utf8BOM = new byte[] { 0xEF, 0xBB, 0xBF };

    /// <summary>
    /// Replaces every <c>\uXXXX</c> escape sequence in the input with the UTF-16
    /// code unit it denotes. Text that is not part of an escape is copied unchanged.
    /// </summary>
    /// <param name="unicodeString">Text that may contain <c>\uXXXX</c> escapes; may be null.</param>
    /// <returns>
    /// The decoded text, or <see cref="string.Empty"/> when the input is null or empty.
    /// </returns>
    public static string Convert(string unicodeString)
    {
        if (string.IsNullOrEmpty(unicodeString))
        {
            return string.Empty;
        }

        // Decode all escapes in a single left-to-right pass over the ORIGINAL text.
        // Replacing match by match on the evolving output would re-scan characters
        // produced by earlier replacements, so "\u005cu0041 \u0041" would wrongly
        // collapse to "A A" instead of "\u0041 A".
        return UnicodeEscapeRegex.Replace(
            unicodeString,
            match => ConvertUnicodeStringToChar(match.Value).ToString());
    }

    /// <summary>
    /// Converts one escape of the form <c>\uXXXX</c> into the corresponding character.
    /// </summary>
    /// <param name="str">A six-character escape: a two-character prefix followed by four hex digits.</param>
    /// <returns>The character whose UTF-16 code unit equals the parsed hex value.</returns>
    private static char ConvertUnicodeStringToChar(string str)
    {
        // Skip the two-character "\u" prefix; what remains are the hex digits.
        int codeUnit = int.Parse(str.Substring(2), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
        return (char)codeUnit;
    }

    /// <summary>
    /// Determines whether a byte array starts with the UTF-8 BOM (EF BB BF).
    /// </summary>
    /// <param name="bytes">The bytes to inspect; may be null.</param>
    /// <returns>
    /// <c>true</c> when the array begins with the BOM; <c>false</c> when it does not,
    /// is shorter than the BOM, or is null.
    /// </returns>
    public static bool HasUtf8BOM(byte[] bytes)
    {
        // An array shorter than the BOM (or no array at all) cannot contain it.
        if (bytes == null || bytes.Length < Utf8BOM.Length)
        {
            return false;
        }

        // Compare the leading bytes against the BOM one by one.
        for (int i = 0; i < Utf8BOM.Length; i++)
        {
            if (bytes[i] != Utf8BOM[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Removes a leading UTF-8 BOM from a byte array.
    /// </summary>
    /// <param name="bytes">The bytes to process; may be null.</param>
    /// <returns>
    /// A new array without the leading BOM when one is present; otherwise the
    /// original array instance (or null) unchanged.
    /// </returns>
    public static byte[] RemoveUtf8BOM(byte[] bytes)
    {
        if (!HasUtf8BOM(bytes))
        {
            // No BOM: hand back the caller's array as-is.
            return bytes;
        }

        // Copy everything after the BOM into a fresh, shorter array.
        byte[] result = new byte[bytes.Length - Utf8BOM.Length];
        Array.Copy(bytes, Utf8BOM.Length, result, 0, result.Length);
        return result;
    }

    /// <summary>
    /// Converts a byte array that may carry a UTF-8 BOM into one without it.
    /// Equivalent to <see cref="RemoveUtf8BOM"/>.
    /// </summary>
    /// <param name="bytes">The bytes to process; may be null.</param>
    /// <returns>The result of <see cref="RemoveUtf8BOM"/> for the same input.</returns>
    public static byte[] ConvertToNonBOM(byte[] bytes)
    {
        return RemoveUtf8BOM(bytes);
    }

    /// <summary>
    /// Compares two byte arrays element by element.
    /// </summary>
    /// <param name="arr1">First array; must not be null.</param>
    /// <param name="arr2">Second array; must not be null.</param>
    /// <returns><c>true</c> when both arrays have the same length and contents.</returns>
    private static bool AreArraysEqual(byte[] arr1, byte[] arr2)
    {
        if (arr1.Length != arr2.Length)
        {
            return false;
        }

        for (int i = 0; i < arr1.Length; i++)
        {
            if (arr1[i] != arr2[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Guesses the encoding of a byte array by decoding and re-encoding it with each
    /// candidate encoding in turn and reporting the first candidate that reproduces
    /// the input bytes exactly.
    /// </summary>
    /// <param name="bytes">The bytes to inspect; may be null.</param>
    /// <returns>
    /// The <see cref="Encoding.EncodingName"/> of the first candidate that round-trips
    /// the input, or <c>"Unknown Encoding"</c> when none does or the input is null.
    /// </returns>
    public static string GetEncoding(byte[] bytes)
    {
        if (bytes == null)
        {
            // Nothing to probe; report the same value the probing loop falls back to.
            return "Unknown Encoding";
        }

        // Candidates are tried in this order and the first round-trip wins.
        // NOTE(review): any input that round-trips through ASCII also round-trips
        // through UTF-8, which is probed first, so the ASCII entry is never the one
        // reported. The order is kept so existing return values do not change.
        Encoding[] encodings =
        {
            Encoding.UTF8,
            Encoding.ASCII,
            Encoding.Unicode,
            Encoding.BigEndianUnicode,
            Encoding.GetEncoding("ISO-8859-1")
        };

        foreach (Encoding encoding in encodings)
        {
            try
            {
                string decoded = encoding.GetString(bytes);
                byte[] reencoded = encoding.GetBytes(decoded);
                if (AreArraysEqual(reencoded, bytes))
                {
                    return encoding.EncodingName;
                }
            }
            catch (ArgumentException)
            {
                // Decoder/encoder fallback failures derive from ArgumentException.
                // This candidate cannot represent the data; try the next one.
            }
        }

        return "Unknown Encoding";
    }
}