UTF-8 encoded files come in two kinds: with a BOM (byte order mark) and without one. Files with a BOM are easy to recognize, while files without a BOM are more troublesome to detect, so I wrote a function to make the determination. The code is as follows:
/**
 * Guess whether a byte string is pure ASCII, UTF-8, or some other
 * (assumed legacy double-byte, e.g. GB2312/GBK) encoding.
 *
 * Detection order:
 *   1. If the text starts with the UTF-8 byte order mark (EF BB BF) it is
 *      reported as UTF-8 immediately.
 *   2. Otherwise every byte is scanned. A continuation byte (10xxxxxx)
 *      that directly follows a lead byte (11xxxxxx) counts as "good";
 *      a continuation byte that follows an ASCII byte, or a lead byte that
 *      is not followed by a continuation byte, counts as "bad".
 *   3. No bytes >= 0x80 at all means pure ASCII; otherwise the text is
 *      considered UTF-8 when the good pairs are at least as numerous as
 *      the bad ones.
 *
 * This is a heuristic, not a strict validator: it tolerates a few broken
 * sequences and can be fooled by legacy text whose trail bytes happen to
 * fall in the 0x80-0xBF range. A lead byte at the very end of the input is
 * deliberately not penalised, so a buffer cut in the middle of a character
 * is still recognised.
 *
 * Inputs of any length (including the empty string) are classified; the
 * function always returns one of the three integer codes below.
 *
 * @param string $text Raw bytes to inspect.
 * @return int 1 = pure ASCII (no byte greater than 127),
 *             2 = UTF-8,
 *             0 = neither (treated as ordinary GB encoding).
 */
function TestUtf8($text)
{
    $text = (string) $text;
    $length = strlen($text);

    // A leading EF BB BF is the UTF-8 signature; nothing else needs checking.
    if ($length >= 3 && substr($text, 0, 3) === "\xEF\xBB\xBF") {
        return 2;
    }

    $lastch = 0;   // previous byte; 0 (ASCII) before the first byte
    $good = 0;     // lead byte followed by a continuation byte
    $bad = 0;      // orphan continuation byte or lead byte without continuation
    $notAscii = 0; // number of bytes >= 0x80

    for ($i = 0; $i < $length; $i++) {
        $ch = ord($text[$i]);

        if ($ch >= 0x80) {
            $notAscii++;
        }

        if (($ch & 0xC0) == 0x80) {
            // Current byte is a continuation byte (10xxxxxx).
            if (($lastch & 0xC0) == 0xC0) {
                $good += 1;
            } elseif (($lastch & 0x80) == 0) {
                // Continuation byte right after plain ASCII is never valid.
                $bad += 1;
            }
            // A continuation after another continuation is neutral: it is
            // the 3rd/4th byte of a longer sequence already counted above.
        } elseif (($lastch & 0xC0) == 0xC0) {
            // Previous byte announced a multi-byte sequence but this byte
            // does not continue it.
            $bad += 1;
        }

        $lastch = $ch;
    }

    if ($notAscii == 0) {
        return 1;
    }

    return ($good >= $bad) ? 2 : 0;
}