Why is there
no full support for Unicode?
The encoding should be determined from the BOM.
A file should be classified as binary only
after checking it for 0x00 bytes.
BOM is part of the Unicode standard. http://www.unicode.org/faq/utf_bom.html#bom4
Files in an encoding wider than 8 bits that have no BOM
at the beginning can be immediately identified as binary.
My function in C#:
/// <summary>
/// Detects the text encoding of a stream from its byte order mark (BOM).
/// When no BOM is present, up to the first 100000 bytes are scanned for a
/// 0x00 byte; finding one classifies the content as binary.
/// The stream is rewound to position 0 before returning.
/// </summary>
/// <param name="stream">A readable, seekable stream to inspect.</param>
/// <returns>
/// The encoding indicated by the BOM; <c>Encoding.Default</c> when there is
/// no BOM and no 0x00 byte was found in the scanned prefix; null - binary.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="stream"/> is null.
/// </exception>
public static Encoding GetEncodingStream(Stream stream)
{
    if (stream == null) {
        throw new System.ArgumentNullException("stream");
    }

    // Upper bound on how many bytes are scanned for 0x00 when no BOM is found.
    const int MaxScanBytes = 100000;

    byte[] bom = new byte[4];
    stream.Seek(0, SeekOrigin.Begin);
    // Track how many bytes were really read so that the zero padding of a
    // short file is never mistaken for BOM bytes.
    int bomLength = ReadUpTo(stream, bom, bom.Length);
    stream.Seek(0, SeekOrigin.Begin);

    if (bomLength >= 4 && bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0xFE && bom[3] == 0xFF) {
        return new UTF32Encoding(true, true); // UTF-32, big-endian
    }
    if (bomLength >= 2 && bom[0] == 0xFE && bom[1] == 0xFF) {
        return new UnicodeEncoding(true, true); // UTF-16, big-endian
    }
    if (bomLength >= 2 && bom[0] == 0xFF && bom[1] == 0xFE) {
        // FF FE is a prefix of the UTF-32 LE BOM (FF FE 00 00); both trailing
        // bytes must be present and zero for it to be UTF-32.
        if (bomLength >= 4 && bom[2] == 0x00 && bom[3] == 0x00) {
            return new UTF32Encoding(false, true); // UTF-32, little-endian
        }
        return new UnicodeEncoding(false, true); // UTF-16, little-endian
    }
    if (bomLength >= 3 && bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF) {
        return new UTF8Encoding(true); // UTF-8 with BOM
    }

    // No BOM: look for a 0x00 byte in the leading part of the stream.
    long available = stream.Length;
    int sampleSize = available > MaxScanBytes ? MaxScanBytes : (int)available;
    byte[] sample = new byte[sampleSize];
    int sampleLength = ReadUpTo(stream, sample, sampleSize);
    stream.Seek(0, SeekOrigin.Begin);

    // Only inspect bytes that were actually read; the unread tail of the
    // buffer is zero-filled and would otherwise look like binary content.
    for (int i = 0; i < sampleLength; i++) {
        if (sample[i] == 0) {
            return null;
        }
    }
    return Encoding.Default;
}

/// <summary>
/// Reads from the stream until <paramref name="count"/> bytes have been
/// stored in <paramref name="buffer"/> or the end of the stream is reached.
/// </summary>
/// <returns>The number of bytes actually read.</returns>
private static int ReadUpTo(Stream stream, byte[] buffer, int count)
{
    int total = 0;
    while (total < count) {
        // Stream.Read may return fewer bytes than requested; 0 means end of stream.
        int read = stream.Read(buffer, total, count - total);
        if (read <= 0) {
            break;
        }
        total += read;
    }
    return total;
}