using
System;
using
System.Text;
using
System.Text.RegularExpressions;
using
System.IO;
namespace
KlerksSoft
{
/// <summary>
/// Guesses the character encoding of text held in a file or in a byte array. A byte-order mark
/// is trusted when present; otherwise statistical heuristics over a sample of the bytes are used
/// to recognise BOM-less UTF-16 and UTF-8. When neither test is conclusive the caller-supplied
/// default encoding is returned.
/// </summary>
public static class TextFileEncodingDetector
{
    // Number of leading bytes sampled when the caller does not specify one: 0x10000 (65,536).
    // The value is arbitrary - inappropriate for high numbers of files / high speed requirements.
    private const long _defaultHeuristicSampleSize = 0x10000;

    // Accepts a string only if, read as one char per byte, it is a well-formed UTF-8 byte stream
    // made of tab/LF/CR, printable ASCII and correctly formed multi-byte sequences.
    // Built once: Regex instances are safe to share for matching.
    private static readonly Regex _utf8Validator = new Regex(
        @"\A(?:"
        + @"[\x09\x0A\x0D\x20-\x7E]"             // tab, LF, CR and printable ASCII
        + @"|[\xC2-\xDF][\x80-\xBF]"              // 2-byte sequences (no overlongs)
        + @"|\xE0[\xA0-\xBF][\x80-\xBF]"          // 3-byte sequences, excluding overlongs
        + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"   // remaining 3-byte sequences
        + @"|\xED[\x80-\x9F][\x80-\xBF]"          // 3-byte sequences, excluding surrogates
        + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"       // 4-byte sequences, excluding overlongs
        + @"|[\xF1-\xF3][\x80-\xBF]{3}"           // 4-byte sequences
        + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"       // 4-byte sequences up to U+10FFFF
        + @")*\z");

    /// <summary>
    /// Opens the named file and detects its encoding using the default sample size.
    /// </summary>
    /// <param name="InputFilename">Path of the file to inspect.</param>
    /// <param name="DefaultEncoding">Value returned when detection is inconclusive.</param>
    /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
    public static Encoding DetectTextFileEncoding(string InputFilename, Encoding DefaultEncoding)
    {
        using (FileStream textfileStream = File.OpenRead(InputFilename))
        {
            return DetectTextFileEncoding(textfileStream, DefaultEncoding, _defaultHeuristicSampleSize);
        }
    }

    /// <summary>
    /// Detects the encoding of an open file stream. The stream is read from its start and its
    /// original position is restored before returning, including when an exception is thrown.
    /// </summary>
    /// <param name="InputFileStream">A readable, seekable file stream.</param>
    /// <param name="DefaultEncoding">Value returned when detection is inconclusive.</param>
    /// <param name="HeuristicSampleSize">
    /// Maximum number of leading bytes to analyse when no byte-order mark is found. Values below
    /// zero are treated as zero; values above the file length are capped to the file length.
    /// </param>
    /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="InputFileStream"/> is null.</exception>
    /// <exception cref="ArgumentException">The stream cannot be read or cannot seek.</exception>
    public static Encoding DetectTextFileEncoding(FileStream InputFileStream, Encoding DefaultEncoding, long HeuristicSampleSize)
    {
        if (InputFileStream == null)
        {
            throw new ArgumentNullException("InputFileStream", "Must provide a valid Filestream!");
        }
        if (!InputFileStream.CanRead)
        {
            throw new ArgumentException("Provided file stream is not readable!", "InputFileStream");
        }
        if (!InputFileStream.CanSeek)
        {
            throw new ArgumentException("Provided file stream cannot seek!", "InputFileStream");
        }

        long originalPos = InputFileStream.Position;
        try
        {
            InputFileStream.Position = 0;
            long fileLength = InputFileStream.Length;

            // Step 1: look for a byte-order mark in the first (up to) four bytes.
            byte[] bomBytes = new byte[(int)Math.Min(fileLength, 4L)];
            int bomBytesRead = ReadFully(InputFileStream, bomBytes, 0, bomBytes.Length);
            if (bomBytesRead < bomBytes.Length)
            {
                // The file shrank while being read; keep only what was actually obtained.
                Array.Resize(ref bomBytes, bomBytesRead);
            }

            Encoding encodingFound = DetectBOMBytes(bomBytes);
            if (encodingFound != null)
            {
                return encodingFound;
            }

            // Step 2: no BOM, so build the heuristic sample, reusing the bytes already read.
            long cappedLength = Math.Min(fileLength, (long)int.MaxValue);
            int sampleLength = (int)Math.Min(Math.Max(HeuristicSampleSize, 0L), cappedLength);
            byte[] sampleBytes = new byte[sampleLength];

            int bytesInSample = Math.Min(bomBytes.Length, sampleLength);
            Array.Copy(bomBytes, sampleBytes, bytesInSample);

            // More bytes are only needed when every BOM byte was copied, in which case the stream
            // is positioned exactly where the remainder of the sample starts.
            if (sampleLength > bytesInSample)
            {
                bytesInSample += ReadFully(InputFileStream, sampleBytes, bytesInSample, sampleLength - bytesInSample);
            }
            if (bytesInSample < sampleBytes.Length)
            {
                Array.Resize(ref sampleBytes, bytesInSample);
            }

            encodingFound = DetectUnicodeInByteSampleByHeuristics(sampleBytes);
            return encodingFound != null ? encodingFound : DefaultEncoding;
        }
        finally
        {
            InputFileStream.Position = originalPos;
        }
    }

    /// <summary>
    /// Detects the encoding of text held in a byte array, checking for a byte-order mark first
    /// and falling back to the heuristics.
    /// </summary>
    /// <param name="TextData">The raw bytes of the text.</param>
    /// <param name="DefaultEncoding">Value returned when detection is inconclusive.</param>
    /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="TextData"/> is null.</exception>
    public static Encoding DetectTextByteArrayEncoding(byte[] TextData, Encoding DefaultEncoding)
    {
        if (TextData == null)
        {
            throw new ArgumentNullException("TextData", "Must provide a valid text data byte array!");
        }

        Encoding encodingFound = DetectBOMBytes(TextData);
        if (encodingFound == null)
        {
            encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData);
        }
        return encodingFound != null ? encodingFound : DefaultEncoding;
    }

    /// <summary>
    /// Identifies an encoding from a byte-order mark at the start of the supplied bytes.
    /// </summary>
    /// <param name="BOMBytes">The leading bytes of the data; only the first four are examined.</param>
    /// <returns>
    /// The encoding announced by the byte-order mark (UTF-16 LE/BE, UTF-8, UTF-7, UTF-32 LE/BE),
    /// or null when no known mark is present.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="BOMBytes"/> is null.</exception>
    public static Encoding DetectBOMBytes(byte[] BOMBytes)
    {
        if (BOMBytes == null)
        {
            throw new ArgumentNullException("BOMBytes", "Must provide a valid BOM byte array!");
        }

        if (BOMBytes.Length < 2)
        {
            return null;
        }

        // FF FE is UTF-16 LE unless it is followed by 00 00, which makes it the UTF-32 LE mark.
        if (BOMBytes[0] == 0xff
            && BOMBytes[1] == 0xfe
            && (BOMBytes.Length < 4 || BOMBytes[2] != 0 || BOMBytes[3] != 0))
        {
            return Encoding.Unicode;
        }

        if (BOMBytes[0] == 0xfe && BOMBytes[1] == 0xff)
        {
            return Encoding.BigEndianUnicode;
        }

        if (BOMBytes.Length < 3)
        {
            return null;
        }

        if (BOMBytes[0] == 0xef && BOMBytes[1] == 0xbb && BOMBytes[2] == 0xbf)
        {
            return Encoding.UTF8;
        }

        if (BOMBytes[0] == 0x2b && BOMBytes[1] == 0x2f && BOMBytes[2] == 0x76)
        {
            return Encoding.UTF7;
        }

        if (BOMBytes.Length < 4)
        {
            return null;
        }

        if (BOMBytes[0] == 0xff && BOMBytes[1] == 0xfe && BOMBytes[2] == 0 && BOMBytes[3] == 0)
        {
            return Encoding.UTF32;
        }

        if (BOMBytes[0] == 0 && BOMBytes[1] == 0 && BOMBytes[2] == 0xfe && BOMBytes[3] == 0xff)
        {
            // Code page 12001 is big-endian UTF-32.
            return Encoding.GetEncoding(12001);
        }

        return null;
    }

    /// <summary>
    /// Applies statistical heuristics to a sample of bytes to recognise BOM-less UTF-16
    /// (little or big endian) and UTF-8.
    /// </summary>
    /// <param name="SampleBytes">Leading bytes of the data; may end in the middle of a character.</param>
    /// <returns>The encoding the sample most likely uses, or null when inconclusive.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="SampleBytes"/> is null.</exception>
    public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
    {
        if (SampleBytes == null)
        {
            throw new ArgumentNullException("SampleBytes", "Must provide a valid sample byte array!");
        }
        if (SampleBytes.Length == 0)
        {
            // Nothing to measure; also avoids dividing by zero below.
            return null;
        }

        long oddBinaryNullsInSample = 0;
        long evenBinaryNullsInSample = 0;
        long suspiciousUTF8SequenceCount = 0;
        long suspiciousUTF8BytesTotal = 0;
        long likelyUSASCIIBytesInSample = 0;
        int skipUTF8Bytes = 0;

        // Single pass gathering every statistic used by the tests below.
        for (int currentPos = 0; currentPos < SampleBytes.Length; currentPos++)
        {
            byte current = SampleBytes[currentPos];

            if (current == 0)
            {
                if (currentPos % 2 == 0)
                {
                    evenBinaryNullsInSample++;
                }
                else
                {
                    oddBinaryNullsInSample++;
                }
            }

            if (IsCommonUSASCIIByte(current))
            {
                likelyUSASCIIBytesInSample++;
            }

            if (skipUTF8Bytes > 0)
            {
                // Still inside a sequence that has already been counted.
                skipUTF8Bytes--;
                continue;
            }

            int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);
            if (lengthFound > 0)
            {
                suspiciousUTF8SequenceCount++;
                suspiciousUTF8BytesTotal += lengthFound;
                skipUTF8Bytes = lengthFound - 1;
            }
        }

        // Each ratio is the fraction of two-byte units whose byte at that parity is zero.
        double evenNullRatio = (evenBinaryNullsInSample * 2.0) / SampleBytes.Length;
        double oddNullRatio = (oddBinaryNullsInSample * 2.0) / SampleBytes.Length;

        // Zeros concentrated at odd offsets: the high byte follows the low byte (UTF-16 LE).
        if (evenNullRatio < 0.2 && oddNullRatio > 0.6)
        {
            return Encoding.Unicode;
        }

        // Zeros concentrated at even offsets: the high byte comes first (UTF-16 BE).
        if (oddNullRatio < 0.2 && evenNullRatio > 0.6)
        {
            return Encoding.BigEndianUnicode;
        }

        // The validator works on chars, so each byte is mapped to the char with the same numeric
        // value. Decoding as ASCII would replace every byte >= 0x80 with '?' and hide exactly the
        // multi-byte sequences that have to be checked. A sequence cut off by the end of the
        // sample is left out so that sampling a larger file does not invalidate it.
        int validatedLength = GetLengthWithoutTruncatedUTF8Tail(SampleBytes);
        string sampleAsChars = BytesToChars(SampleBytes, validatedLength);

        if (!_utf8Validator.IsMatch(sampleAsChars))
        {
            return null;
        }

        // Being valid UTF-8 is weak evidence on its own: when every byte is plain ASCII, most
        // western charsets are the same as UTF-8 in these ranges. Accented and typographic
        // characters, however, would have been turned into multi-byte sequences by the UTF-8
        // encoding process, so UTF-8 is only reported when
        //   - at least one such "suspicious" sequence occurs per 500,000 sampled bytes, and
        //   - at least 80% of the bytes outside those sequences are common US-ASCII bytes
        //     (or there are no bytes outside those sequences at all).
        long bytesOutsideSequences = SampleBytes.Length - suspiciousUTF8BytesTotal;
        bool hasEnoughSequences = suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1;
        bool restIsMostlyASCII = bytesOutsideSequences == 0
            || likelyUSASCIIBytesInSample * 1.0 / bytesOutsideSequences >= 0.8;

        if (hasEnoughSequences && restIsMostlyASCII)
        {
            return Encoding.UTF8;
        }

        return null;
    }

    /// <summary>
    /// Returns true for tab, LF, CR and the printable US-ASCII range 0x20-0x7E.
    /// </summary>
    private static bool IsCommonUSASCIIByte(byte testByte)
    {
        return testByte == 0x0A
            || testByte == 0x0D
            || testByte == 0x09
            || (testByte >= 0x20 && testByte <= 0x7E);
    }

    /// <summary>
    /// Checks whether one of a fixed list of two- and three-byte UTF-8 sequences starts at
    /// <paramref name="currentPos"/>.
    /// </summary>
    /// <param name="SampleBytes">The sample being scanned.</param>
    /// <param name="currentPos">Index of the candidate lead byte.</param>
    /// <returns>The length of the matched sequence (2 or 3), or 0 when none matches.</returns>
    private static int DetectSuspiciousUTF8SequenceLength(byte[] SampleBytes, long currentPos)
    {
        // Every listed sequence needs at least one byte after the lead byte.
        long bytesAvailable = SampleBytes.Length - currentPos;
        if (bytesAvailable < 2)
        {
            return 0;
        }

        byte second = SampleBytes[currentPos + 1];

        switch (SampleBytes[currentPos])
        {
            case 0xC2:
                if (second == 0x81 || second == 0x8D || second == 0x8F
                    || second == 0x90 || second == 0x9D
                    || (second >= 0xA0 && second <= 0xBF))
                {
                    return 2;
                }
                return 0;

            case 0xC3:
                return (second >= 0x80 && second <= 0xBF) ? 2 : 0;

            case 0xC5:
                if (second == 0x92 || second == 0x93
                    || second == 0xA0 || second == 0xA1
                    || second == 0xB8 || second == 0xBD || second == 0xBE)
                {
                    return 2;
                }
                return 0;

            case 0xC6:
                return second == 0x92 ? 2 : 0;

            case 0xCB:
                return (second == 0x86 || second == 0x9C) ? 2 : 0;

            case 0xE2:
                // Three-byte sequences need two bytes after the lead byte.
                if (bytesAvailable < 3)
                {
                    return 0;
                }
                return IsSuspiciousE2Tail(second, SampleBytes[currentPos + 2]) ? 3 : 0;

            default:
                return 0;
        }
    }

    // Tests the two bytes following a 0xE2 lead byte against the listed three-byte sequences.
    private static bool IsSuspiciousE2Tail(byte second, byte third)
    {
        if (second == 0x80)
        {
            switch (third)
            {
                case 0x93:
                case 0x94:
                case 0x98:
                case 0x99:
                case 0x9A:
                case 0x9C:
                case 0x9D:
                case 0x9E:
                case 0xA0:
                case 0xA1:
                case 0xA2:
                case 0xA6:
                case 0xB0:
                case 0xB9:
                case 0xBA:
                    return true;
                default:
                    return false;
            }
        }

        return (second == 0x82 && third == 0xAC)
            || (second == 0x84 && third == 0xA2);
    }

    // Returns the number of leading bytes to validate: the full length, or the offset of the final
    // lead byte when the multi-byte sequence it starts runs past the end of the array.
    private static int GetLengthWithoutTruncatedUTF8Tail(byte[] bytes)
    {
        // Step back over at most three trailing continuation bytes (10xxxxxx).
        int index = bytes.Length - 1;
        int continuationBytes = 0;
        while (index >= 0 && continuationBytes < 3 && (bytes[index] & 0xC0) == 0x80)
        {
            index--;
            continuationBytes++;
        }
        if (index < 0)
        {
            return bytes.Length;
        }

        byte lead = bytes[index];
        int expectedLength;
        if (lead >= 0xC2 && lead <= 0xDF)
        {
            expectedLength = 2;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            expectedLength = 3;
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            expectedLength = 4;
        }
        else
        {
            // Not a multi-byte lead byte; leave everything for the validator to judge.
            return bytes.Length;
        }

        return (bytes.Length - index) < expectedLength ? index : bytes.Length;
    }

    // Builds a string whose chars have the same numeric values as the first 'count' bytes.
    private static string BytesToChars(byte[] bytes, int count)
    {
        char[] chars = new char[count];
        for (int i = 0; i < count; i++)
        {
            chars[i] = (char)bytes[i];
        }
        return new string(chars);
    }

    // Reads until 'count' bytes have been stored or the stream ends, because a single Read call
    // may return fewer bytes than requested. Returns the number of bytes actually read.
    private static int ReadFully(Stream stream, byte[] buffer, int offset, int count)
    {
        int totalRead = 0;
        while (totalRead < count)
        {
            int read = stream.Read(buffer, offset + totalRead, count - totalRead);
            if (read <= 0)
            {
                break;
            }
            totalRead += read;
        }
        return totalRead;
    }
}
}