GNU bug report logs -
#31185
Why is there no full support for Unicode?
Previous Next
Full log
View this message in rfc822 format
[Message part 1 (text/plain, inline)]
Why is there no full support for Unicode?
The encoding should be determined from the BOM.
A file should be classified as binary only after it has been checked for
0x00 bytes.
BOM is part of the Unicode standard.
http://www.unicode.org/faq/utf_bom.html#bom4
Files in an encoding wider than 8 bits that have no BOM at the beginning
can be immediately identified as binary.
My function in C#:
/// <summary>
/// Detects the text encoding of a seekable stream from its byte order mark (BOM).
/// When no BOM is present, up to the first 100000 bytes are scanned for 0x00
/// bytes to decide whether the content is binary.
/// </summary>
/// <param name="stream">
/// A readable, seekable stream. It is rewound to position 0 before the method returns.
/// </param>
/// <returns>
/// The encoding indicated by the BOM; <see cref="Encoding.Default"/> when there is no
/// BOM and no 0x00 byte in the scanned prefix; null - binary
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="stream"/> is null.</exception>
/// <exception cref="System.ArgumentException"><paramref name="stream"/> is not readable.</exception>
public static Encoding GetEncodingStream(Stream stream)
{
    // Upper bound on how many bytes are inspected when looking for 0x00 bytes.
    const int MaxProbeBytes = 100000;

    if (stream == null) {
        throw new System.ArgumentNullException("stream");
    }
    if (!stream.CanRead) {
        throw new System.ArgumentException("Stream was not readable.", "stream");
    }

    stream.Seek(0, SeekOrigin.Begin);
    try {
        byte[] bom = new byte[4];
        // Track how many bytes were really read: the buffer is zero-initialized, so
        // for files shorter than 4 bytes the padding must not be mistaken for content.
        int bomLength = ReadPrefix(stream, bom, bom.Length);

        if (bomLength >= 4 && bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0xFE && bom[3] == 0xFF) {
            return new UTF32Encoding(true, true); // UTF-32, big-endian
        }
        if (bomLength >= 2 && bom[0] == 0xFE && bom[1] == 0xFF) {
            return new UnicodeEncoding(true, true); // UTF-16, big-endian
        }
        if (bomLength >= 2 && bom[0] == 0xFF && bom[1] == 0xFE) {
            // FF FE 00 00 is the UTF-32 little-endian BOM. It is indistinguishable from a
            // UTF-16 little-endian BOM followed by U+0000; UTF-32 is preferred in that case.
            if (bomLength >= 4 && bom[2] == 0x00 && bom[3] == 0x00) {
                return new UTF32Encoding(false, true); // UTF-32, little-endian
            }
            return new UnicodeEncoding(false, true); // UTF-16, little-endian
        }
        if (bomLength >= 3 && bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF) {
            return new UTF8Encoding(true); // UTF-8 with BOM
        }

        // No BOM: any 0x00 byte in the leading part of the stream marks it as binary.
        stream.Seek(0, SeekOrigin.Begin);
        int probeSize = (int)System.Math.Min(stream.Length, (long)MaxProbeBytes);
        byte[] probe = new byte[probeSize];
        int probeLength = ReadPrefix(stream, probe, probe.Length);
        if (System.Array.IndexOf(probe, (byte)0, 0, probeLength) >= 0) {
            return null;
        }
        return Encoding.Default;
    } finally {
        // Leave the stream positioned at the start for the caller.
        stream.Seek(0, SeekOrigin.Begin);
    }
}

/// <summary>
/// Reads from the current position until <paramref name="count"/> bytes have been
/// read or the stream ends, because a single Read call may return fewer bytes.
/// </summary>
/// <returns>The number of bytes actually placed at the start of the buffer.</returns>
private static int ReadPrefix(Stream stream, byte[] buffer, int count)
{
    int total = 0;
    while (total < count) {
        int read = stream.Read(buffer, total, count - total);
        if (read <= 0) {
            break;
        }
        total += read;
    }
    return total;
}
[Message part 2 (text/html, inline)]
This bug report was last modified 7 years and 62 days ago.
Previous Next
GNU bug tracking system
Copyright (C) 1999 Darren O. Benham,
1997,2003 nCipher Corporation Ltd,
1994-97 Ian Jackson.