首页 > 类库下载 > C#类库 > c#如何检测文本文件的编码

c#如何检测文本文件的编码

大家讲道理
发布: 2016-11-10 09:13:14
原创
1903 人浏览过

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

using System;

using System.Text;

using System.Text.RegularExpressions;

using System.IO;

  

namespace KlerksSoft

{

    public static class TextFileEncodingDetector

    {

        /*

* Simple class to handle text file encoding woes (in a primarily English-speaking tech

* world).

*

* - This code is fully managed, no shady calls to MLang (the unmanaged codepage

* detection library originally developed for Internet Explorer).

*

* - This class does NOT try to detect arbitrary codepages/charsets, it really only

* aims to differentiate between some of the most common variants of Unicode

* encoding, and a "default" (western / ascii-based) encoding alternative provided

* by the caller.

*

* - As there is no "Reliable" way to distinguish between UTF-8 (without BOM) and

* Windows-1252 (in .Net, also incorrectly called "ASCII") encodings, we use a

* heuristic - so the more of the file we can sample the better the guess. If you

* are going to read the whole file into memory at some point, then best to pass

* in the whole byte array directly. Otherwise, decide how to trade off

* reliability against performance / memory usage.

*

* - The UTF-8 detection heuristic only works for western text, as it relies on

* the presence of UTF-8 encoded accented and other characters found in the upper

* ranges of the Latin-1 and (particularly) Windows-1252 codepages.

*

* - For more general detection routines, see existing projects / resources:

* - MLang - Microsoft library originally for IE6, available in Windows XP and later APIs now (I think?)

* - MLang .Net bindings: http://www.codeproject.com/KB/recipes/DetectEncoding.aspx

* - CharDet - Mozilla browser's detection routines

* - Ported to Java then .Net: http://www.conceptdevelopment.net/Localization/NCharDet/

* - Ported straight to .Net: http://code.google.com/p/chardetsharp/source/browse

*

* Copyright Tao Klerks, Jan 2010, tao@klerks.biz

* Licensed under the modified BSD license:

*

  

Redistribution and use in source and binary forms, with or without modification, are

permitted provided that the following conditions are met:

  

- Redistributions of source code must retain the above copyright notice, this list of

conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright notice, this list

of conditions and the following disclaimer in the documentation and/or other materials

provided with the distribution.

- The name of the author may not be used to endorse or promote products derived from

this software without specific prior written permission.

  

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,

INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY

DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,

WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY

OF SUCH DAMAGE.

  

*

*/

  

        

 const long _defaultHeuristicSampleSize = 0x10000; //completely

arbitrary - inappropriate for high numbers of files / high speed

requirements

  

        public static Encoding DetectTextFileEncoding(string InputFilename, Encoding DefaultEncoding)

        {

            using (FileStream textfileStream = File.OpenRead(InputFilename))

            {

                return DetectTextFileEncoding(textfileStream, DefaultEncoding, _defaultHeuristicSampleSize);

            }

        }

  

        

 public static Encoding DetectTextFileEncoding(FileStream

InputFileStream, Encoding DefaultEncoding, long HeuristicSampleSize)

        {

            if (InputFileStream == null)

                throw new ArgumentNullException("Must provide a valid Filestream!", "InputFileStream");

  

            if (!InputFileStream.CanRead)

                throw new ArgumentException("Provided file stream is not readable!", "InputFileStream");

  

            if (!InputFileStream.CanSeek)

                throw new ArgumentException("Provided file stream cannot seek!", "InputFileStream");

  

            Encoding encodingFound = null;

  

            long originalPos = InputFileStream.Position;

  

            InputFileStream.Position = 0;

  

            //First read only what we need for BOM detection

  

            byte[] bomBytes = new byte[InputFileStream.Length > 4 ? 4 : InputFileStream.Length];

            InputFileStream.Read(bomBytes, 0, bomBytes.Length);

  

            encodingFound = DetectBOMBytes(bomBytes);

  

            if (encodingFound != null)

            {

                InputFileStream.Position = originalPos;

                return encodingFound;

            }

  

            //BOM Detection failed, going for heuristics now.

            // create sample byte array and populate it

            

 byte[] sampleBytes = new byte[HeuristicSampleSize >

InputFileStream.Length ? InputFileStream.Length : HeuristicSampleSize];

            Array.Copy(bomBytes, sampleBytes, bomBytes.Length);

            if (InputFileStream.Length > bomBytes.Length)

                InputFileStream.Read(sampleBytes, bomBytes.Length, sampleBytes.Length - bomBytes.Length);

            InputFileStream.Position = originalPos;

  

            //test byte array content

            encodingFound = DetectUnicodeInByteSampleByHeuristics(sampleBytes);

  

            if (encodingFound != null)

                return encodingFound;

            else

                return DefaultEncoding;

        }

  

        public static Encoding DetectTextByteArrayEncoding(byte[] TextData, Encoding DefaultEncoding)

        {

            if (TextData == null)

                throw new ArgumentNullException("Must provide a valid text data byte array!", "TextData");

  

            Encoding encodingFound = null;

  

            encodingFound = DetectBOMBytes(TextData);

  

            if (encodingFound != null)

            {

                return encodingFound;

            }

            else

            {

                //test byte array content

                encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData);

  

                if (encodingFound != null)

                    return encodingFound;

                else

                    return DefaultEncoding;

            }

  

        }

  

        public static Encoding DetectBOMBytes(byte[] BOMBytes)

        {

            if (BOMBytes == null)

                throw new ArgumentNullException("Must provide a valid BOM byte array!", "BOMBytes");

  

            if (BOMBytes.Length < 2)

                return null;

  

            if (BOMBytes[0] == 0xff

                && BOMBytes[1] == 0xfe

                && (BOMBytes.Length < 4

                    || BOMBytes[2] != 0

                    || BOMBytes[3] != 0

                    )

                )

                return Encoding.Unicode;

  

            if (BOMBytes[0] == 0xfe

                && BOMBytes[1] == 0xff

                )

                return Encoding.BigEndianUnicode;

  

            if (BOMBytes.Length < 3)

                return null;

  

            if (BOMBytes[0] == 0xef && BOMBytes[1] == 0xbb && BOMBytes[2] == 0xbf)

                return Encoding.UTF8;

  

            if (BOMBytes[0] == 0x2b && BOMBytes[1] == 0x2f && BOMBytes[2] == 0x76)

                return Encoding.UTF7;

  

            if (BOMBytes.Length < 4)

                return null;

  

            if (BOMBytes[0] == 0xff && BOMBytes[1] == 0xfe && BOMBytes[2] == 0 && BOMBytes[3] == 0)

                return Encoding.UTF32;

  

            if (BOMBytes[0] == 0 && BOMBytes[1] == 0 && BOMBytes[2] == 0xfe && BOMBytes[3] == 0xff)

                return Encoding.GetEncoding(12001);

  

            return null;

        }

  

        public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)

        {

            long oddBinaryNullsInSample = 0;

            long evenBinaryNullsInSample = 0;

            long suspiciousUTF8SequenceCount = 0;

            long suspiciousUTF8BytesTotal = 0;

            long likelyUSASCIIBytesInSample = 0;

  

            //Cycle through, keeping count of binary null positions, possible UTF-8

            // sequences from upper ranges of Windows-1252, and probable US-ASCII

            // character counts.

  

            long currentPos = 0;

            int skipUTF8Bytes = 0;

  

            while (currentPos < SampleBytes.Length)

            {

                //binary null distribution

                if (SampleBytes[currentPos] == 0)

                {

                    if (currentPos % 2 == 0)

                        evenBinaryNullsInSample++;

                    else

                        oddBinaryNullsInSample++;

                }

  

                //likely US-ASCII characters

                if (IsCommonUSASCIIByte(SampleBytes[currentPos]))

                    likelyUSASCIIBytesInSample++;

  

                //suspicious sequences (look like UTF-8)

                if (skipUTF8Bytes == 0)

                {

                    int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);

  

                    if (lengthFound > 0)

                    {

                        suspiciousUTF8SequenceCount++;

                        suspiciousUTF8BytesTotal += lengthFound;

                        skipUTF8Bytes = lengthFound - 1;

                    }

                }

                else

                {

                    skipUTF8Bytes--;

                }

  

                currentPos++;

            }

  

            //1: UTF-16 LE - in english / european environments, this is usually characterized by a

            // high proportion of odd binary nulls (starting at 0), with (as this is text) a low

            // proportion of even binary nulls.

            // The thresholds here used (less than 20% nulls where you expect non-nulls, and more than

            // 60% nulls where you do expect nulls) are completely arbitrary.

  

            if (((evenBinaryNullsInSample * 2.0) / SampleBytes.Length) < 0.2

                && ((oddBinaryNullsInSample * 2.0) / SampleBytes.Length) > 0.6

                )

                return Encoding.Unicode;

  

            //2: UTF-16 BE - in english / european environments, this is usually characterized by a

            // high proportion of even binary nulls (starting at 0), with (as this is text) a low

            // proportion of odd binary nulls.

            // The thresholds here used (less than 20% nulls where you expect non-nulls, and more than

            // 60% nulls where you do expect nulls) are completely arbitrary.

  

            if (((oddBinaryNullsInSample * 2.0) / SampleBytes.Length) < 0.2

                && ((evenBinaryNullsInSample * 2.0) / SampleBytes.Length) > 0.6

                )

                return Encoding.BigEndianUnicode;

  

            //3: UTF-8 - Martin Dürst outlines a method for detecting whether something CAN be UTF-8 content

            // using regexp, in his w3c.org unicode FAQ entry:

            // http://www.w3.org/International/questions/qa-forms-utf-8

            // adapted here for C#.

            string potentiallyMangledString = Encoding.ASCII.GetString(SampleBytes);

            Regex UTF8Validator = new Regex(@"\A("

                + @"[\x09\x0A\x0D\x20-\x7E]"

                + @"|[\xC2-\xDF][\x80-\xBF]"

                + @"|\xE0[\xA0-\xBF][\x80-\xBF]"

                + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"

                + @"|\xED[\x80-\x9F][\x80-\xBF]"

                + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"

                + @"|[\xF1-\xF3][\x80-\xBF]{3}"

                + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"

                + @")*\z");

            if (UTF8Validator.IsMatch(potentiallyMangledString))

            {

                //Unfortunately, just the fact that it CAN be UTF-8 doesn&#39;t tell you much about probabilities.

                

 //If all the characters are in the 0-127 range, no harm done, most

western charsets are same as UTF-8 in these ranges.

                //If some of the characters were in the upper range (western accented

characters), however, they would likely be mangled to 2-byte by the

UTF-8 encoding process.

                // So, we need to play stats.

  

                // The "Random" likelihood of any pair of randomly generated characters being one

                // of these "suspicious" character sequences is:

                // 128 / (256 * 256) = 0.2%.

                //

                // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127

                // character range, so we assume that more than 1 in 500,000 of these character

                // sequences indicates UTF-8. The number 500,000 is completely arbitrary - so sue me.

                //

                // We can only assume these character sequences will be rare if we ALSO assume that this

                // IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is

                // not already suspicious sequences) should be plain US-ASCII bytes. This, I

                // arbitrarily decided, should be 80% (a random distribution, eg binary data, would yield

                // approx 40%, so the chances of hitting this threshold by accident in random data are

                // VERY low).

  

                if ((suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1) //suspicious sequences

                    && (

                           //all suspicious, so cannot evaluate proportion of US-Ascii

                           SampleBytes.Length - suspiciousUTF8BytesTotal == 0

                           ||

                           likelyUSASCIIBytesInSample * 1.0 / (SampleBytes.Length - suspiciousUTF8BytesTotal) >= 0.8

                       )

                    )

                    return Encoding.UTF8;

            }

  

            return null;

        }

  

        private static bool IsCommonUSASCIIByte(byte testByte)

        {

            if (testByte == 0x0A //lf

                || testByte == 0x0D //cr

                || testByte == 0x09 //tab

                || (testByte >= 0x20 && testByte <= 0x2F) //common punctuation

                || (testByte >= 0x30 && testByte <= 0x39) //digits

                || (testByte >= 0x3A && testByte <= 0x40) //common punctuation

                || (testByte >= 0x41 && testByte <= 0x5A) //capital letters

                || (testByte >= 0x5B && testByte <= 0x60) //common punctuation

                || (testByte >= 0x61 && testByte <= 0x7A) //lowercase letters

                || (testByte >= 0x7B && testByte <= 0x7E) //common punctuation

                )

                return true;

            else

                return false;

        }

  

        private static int DetectSuspiciousUTF8SequenceLength(byte[] SampleBytes, long currentPos)

        {

            int lengthFound = 0;

  

            if (SampleBytes.Length >= currentPos + 1

                && SampleBytes[currentPos] == 0xC2

                )

            {

                if (SampleBytes[currentPos + 1] == 0x81

                    || SampleBytes[currentPos + 1] == 0x8D

                    || SampleBytes[currentPos + 1] == 0x8F

                    )

                    lengthFound = 2;

                else if (SampleBytes[currentPos + 1] == 0x90

                    || SampleBytes[currentPos + 1] == 0x9D

                    )

                    lengthFound = 2;

                else if (SampleBytes[currentPos + 1] >= 0xA0

                    && SampleBytes[currentPos + 1] <= 0xBF

                    )

                    lengthFound = 2;

            }

            else if (SampleBytes.Length >= currentPos + 1

                && SampleBytes[currentPos] == 0xC3

                )

            {

                if (SampleBytes[currentPos + 1] >= 0x80

                    && SampleBytes[currentPos + 1] <= 0xBF

                    )

                    lengthFound = 2;

            }

            else if (SampleBytes.Length >= currentPos + 1

                && SampleBytes[currentPos] == 0xC5

                )

            {

                if (SampleBytes[currentPos + 1] == 0x92

                    || SampleBytes[currentPos + 1] == 0x93

                    )

                    lengthFound = 2;

                else if (SampleBytes[currentPos + 1] == 0xA0

                    || SampleBytes[currentPos + 1] == 0xA1

                    )

                    lengthFound = 2;

                else if (SampleBytes[currentPos + 1] == 0xB8

                    || SampleBytes[currentPos + 1] == 0xBD

                    || SampleBytes[currentPos + 1] == 0xBE

                    )

                    lengthFound = 2;

            }

            else if (SampleBytes.Length >= currentPos + 1

                && SampleBytes[currentPos] == 0xC6

                )

            {

                if (SampleBytes[currentPos + 1] == 0x92)

                    lengthFound = 2;

            }

            else if (SampleBytes.Length >= currentPos + 1

                && SampleBytes[currentPos] == 0xCB

                )

            {

                if (SampleBytes[currentPos + 1] == 0x86

                    || SampleBytes[currentPos + 1] == 0x9C

                    )

                    lengthFound = 2;

            }

            else if (SampleBytes.Length >= currentPos + 2

                && SampleBytes[currentPos] == 0xE2

                )

            {

                if (SampleBytes[currentPos + 1] == 0x80)

                {

                    if (SampleBytes[currentPos + 2] == 0x93

                        || SampleBytes[currentPos + 2] == 0x94

                        )

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0x98

                        || SampleBytes[currentPos + 2] == 0x99

                        || SampleBytes[currentPos + 2] == 0x9A

                        )

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0x9C

                        || SampleBytes[currentPos + 2] == 0x9D

                        || SampleBytes[currentPos + 2] == 0x9E

                        )

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0xA0

                        || SampleBytes[currentPos + 2] == 0xA1

                        || SampleBytes[currentPos + 2] == 0xA2

                        )

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0xA6)

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0xB0)

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0xB9

                        || SampleBytes[currentPos + 2] == 0xBA

                        )

                        lengthFound = 3;

                }

                else if (SampleBytes[currentPos + 1] == 0x82

                    && SampleBytes[currentPos + 2] == 0xAC

                    )

                    lengthFound = 3;

                else if (SampleBytes[currentPos + 1] == 0x84

                    && SampleBytes[currentPos + 2] == 0xA2

                    )

                    lengthFound = 3;

            }

  

            return lengthFound;

        }

  

    }

}

登录后复制

使用方法:

1

// Replace the placeholder with the path of the file to inspect; Encoding.Default is
// returned when no Unicode encoding is detected.
Encoding fileEncoding = TextFileEncodingDetector.DetectTextFileEncoding("your file path", Encoding.Default);

登录后复制


来源:php.cn
本站声明
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系admin@php.cn
作者最新文章
热门教程
更多>
最新下载
更多>
网站特效
网站源码
网站素材
前端模板