首頁 類別庫下載 C#類別庫 C#如何偵測文字檔案的編碼

C#如何偵測文字檔案的編碼

Nov 10, 2016 09:13 AM

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

using System;

using System.Text;

using System.Text.RegularExpressions;

using System.IO;

  

namespace KlerksSoft

{

    public static class TextFileEncodingDetector

    {

        /*

* Simple class to handle text file encoding woes (in a primarily English-speaking tech

* world).

*

* - This code is fully managed, no shady calls to MLang (the unmanaged codepage

* detection library originally developed for Internet Explorer).

*

* - This class does NOT try to detect arbitrary codepages/charsets, it really only

* aims to differentiate between some of the most common variants of Unicode

* encoding, and a "default" (western / ascii-based) encoding alternative provided

* by the caller.

*

* - As there is no "Reliable" way to distinguish between UTF-8 (without BOM) and

* Windows-1252 (in .Net, also incorrectly called "ASCII") encodings, we use a

* heuristic - so the more of the file we can sample the better the guess. If you

* are going to read the whole file into memory at some point, then best to pass

* in the whole byte byte array directly. Otherwise, decide how to trade off

* reliability against performance / memory usage.

*

* - The UTF-8 detection heuristic only works for western text, as it relies on

* the presence of UTF-8 encoded accented and other characters found in the upper

* ranges of the Latin-1 and (particularly) Windows-1252 codepages.

*

* - For more general detection routines, see existing projects / resources:

* - MLang - Microsoft library originally for IE6, available in Windows XP and later APIs now (I think?)

* - MLang .Net bindings: http://www.codeproject.com/KB/recipes/DetectEncoding.aspx

* - CharDet - Mozilla browser's detection routines

* - Ported to Java then .Net: http://www.conceptdevelopment.net/Localization/NCharDet/

* - Ported straight to .Net: http://code.google.com/p/chardetsharp/source/browse

*

* Copyright Tao Klerks, Jan 2010, tao@klerks.biz

* Licensed under the modified BSD license:

*

  

Redistribution and use in source and binary forms, with or without modification, are

permitted provided that the following conditions are met:

  

- Redistributions of source code must retain the above copyright notice, this list of

conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright notice, this list

of conditions and the following disclaimer in the documentation and/or other materials

provided with the distribution.

- The name of the author may not be used to endorse or promote products derived from

this software without specific prior written permission.

  

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,

INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY

DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,

WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY

OF SUCH DAMAGE.

  

*

*/

  

        

 const long _defaultHeuristicSampleSize = 0x10000; //completely

arbitrary - inappropriate for high numbers of files / high speed

requirements

  

        public static Encoding DetectTextFileEncoding(string InputFilename, Encoding DefaultEncoding)

        {

            using (FileStream textfileStream = File.OpenRead(InputFilename))

            {

                return DetectTextFileEncoding(textfileStream, DefaultEncoding, _defaultHeuristicSampleSize);

            }

        }

  

        

 public static Encoding DetectTextFileEncoding(FileStream

InputFileStream, Encoding DefaultEncoding, long HeuristicSampleSize)

        {

            if (InputFileStream == null)

                throw new ArgumentNullException("Must provide a valid Filestream!", "InputFileStream");

  

            if (!InputFileStream.CanRead)

                throw new ArgumentException("Provided file stream is not readable!", "InputFileStream");

  

            if (!InputFileStream.CanSeek)

                throw new ArgumentException("Provided file stream cannot seek!", "InputFileStream");

  

            Encoding encodingFound = null;

  

            long originalPos = InputFileStream.Position;

  

            InputFileStream.Position = 0;

  

            //First read only what we need for BOM detection

  

            byte[] bomBytes = new byte[InputFileStream.Length > 4 ? 4 : InputFileStream.Length];

            InputFileStream.Read(bomBytes, 0, bomBytes.Length);

  

            encodingFound = DetectBOMBytes(bomBytes);

  

            if (encodingFound != null)

            {

                InputFileStream.Position = originalPos;

                return encodingFound;

            }

  

            //BOM Detection failed, going for heuristics now.

            // create sample byte array and populate it

            

 byte[] sampleBytes = new byte[HeuristicSampleSize >

InputFileStream.Length ? InputFileStream.Length : HeuristicSampleSize];

            Array.Copy(bomBytes, sampleBytes, bomBytes.Length);

            if (InputFileStream.Length > bomBytes.Length)

                InputFileStream.Read(sampleBytes, bomBytes.Length, sampleBytes.Length - bomBytes.Length);

            InputFileStream.Position = originalPos;

  

            //test byte array content

            encodingFound = DetectUnicodeInByteSampleByHeuristics(sampleBytes);

  

            if (encodingFound != null)

                return encodingFound;

            else

                return DefaultEncoding;

        }

  

        public static Encoding DetectTextByteArrayEncoding(byte[] TextData, Encoding DefaultEncoding)

        {

            if (TextData == null)

                throw new ArgumentNullException("Must provide a valid text data byte array!", "TextData");

  

            Encoding encodingFound = null;

  

            encodingFound = DetectBOMBytes(TextData);

  

            if (encodingFound != null)

            {

                return encodingFound;

            }

            else

            {

                //test byte array content

                encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData);

  

                if (encodingFound != null)

                    return encodingFound;

                else

                    return DefaultEncoding;

            }

  

        }

  

        public static Encoding DetectBOMBytes(byte[] BOMBytes)

        {

            if (BOMBytes == null)

                throw new ArgumentNullException("Must provide a valid BOM byte array!", "BOMBytes");

  

            if (BOMBytes.Length < 2)

                return null;

  

            if (BOMBytes[0] == 0xff

                && BOMBytes[1] == 0xfe

                && (BOMBytes.Length < 4

                    || BOMBytes[2] != 0

                    || BOMBytes[3] != 0

                    )

                )

                return Encoding.Unicode;

  

            if (BOMBytes[0] == 0xfe

                && BOMBytes[1] == 0xff

                )

                return Encoding.BigEndianUnicode;

  

            if (BOMBytes.Length < 3)

                return null;

  

            if (BOMBytes[0] == 0xef && BOMBytes[1] == 0xbb && BOMBytes[2] == 0xbf)

                return Encoding.UTF8;

  

            if (BOMBytes[0] == 0x2b && BOMBytes[1] == 0x2f && BOMBytes[2] == 0x76)

                return Encoding.UTF7;

  

            if (BOMBytes.Length < 4)

                return null;

  

            if (BOMBytes[0] == 0xff && BOMBytes[1] == 0xfe && BOMBytes[2] == 0 && BOMBytes[3] == 0)

                return Encoding.UTF32;

  

            if (BOMBytes[0] == 0 && BOMBytes[1] == 0 && BOMBytes[2] == 0xfe && BOMBytes[3] == 0xff)

                return Encoding.GetEncoding(12001);

  

            return null;

        }

  

        public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)

        {

            long oddBinaryNullsInSample = 0;

            long evenBinaryNullsInSample = 0;

            long suspiciousUTF8SequenceCount = 0;

            long suspiciousUTF8BytesTotal = 0;

            long likelyUSASCIIBytesInSample = 0;

  

            //Cycle through, keeping count of binary null positions, possible UTF-8

            // sequences from upper ranges of Windows-1252, and probable US-ASCII

            // character counts.

  

            long currentPos = 0;

            int skipUTF8Bytes = 0;

  

            while (currentPos < SampleBytes.Length)

            {

                //binary null distribution

                if (SampleBytes[currentPos] == 0)

                {

                    if (currentPos % 2 == 0)

                        evenBinaryNullsInSample++;

                    else

                        oddBinaryNullsInSample++;

                }

  

                //likely US-ASCII characters

                if (IsCommonUSASCIIByte(SampleBytes[currentPos]))

                    likelyUSASCIIBytesInSample++;

  

                //suspicious sequences (look like UTF-8)

                if (skipUTF8Bytes == 0)

                {

                    int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);

  

                    if (lengthFound > 0)

                    {

                        suspiciousUTF8SequenceCount++;

                        suspiciousUTF8BytesTotal += lengthFound;

                        skipUTF8Bytes = lengthFound - 1;

                    }

                }

                else

                {

                    skipUTF8Bytes--;

                }

  

                currentPos++;

            }

  

            //1: UTF-16 LE - in english / european environments, this is usually characterized by a

            // high proportion of odd binary nulls (starting at 0), with (as this is text) a low

            // proportion of even binary nulls.

            // The thresholds here used (less than 20% nulls where you expect non-nulls, and more than

            // 60% nulls where you do expect nulls) are completely arbitrary.

  

            if (((evenBinaryNullsInSample * 2.0) / SampleBytes.Length) < 0.2

                && ((oddBinaryNullsInSample * 2.0) / SampleBytes.Length) > 0.6

                )

                return Encoding.Unicode;

  

            //2: UTF-16 BE - in english / european environments, this is usually characterized by a

            // high proportion of even binary nulls (starting at 0), with (as this is text) a low

            // proportion of odd binary nulls.

            // The thresholds here used (less than 20% nulls where you expect non-nulls, and more than

            // 60% nulls where you do expect nulls) are completely arbitrary.

  

            if (((oddBinaryNullsInSample * 2.0) / SampleBytes.Length) < 0.2

                && ((evenBinaryNullsInSample * 2.0) / SampleBytes.Length) > 0.6

                )

                return Encoding.BigEndianUnicode;

  

            //3: UTF-8 - Martin Dürst outlines a method for detecting whether something CAN be UTF-8 content

            // using regexp, in his w3c.org unicode FAQ entry:

            // http://www.w3.org/International/questions/qa-forms-utf-8

            // adapted here for C#.

            string potentiallyMangledString = Encoding.ASCII.GetString(SampleBytes);

            Regex UTF8Validator = new Regex(@"\A("

                + @"[\x09\x0A\x0D\x20-\x7E]"

                + @"|[\xC2-\xDF][\x80-\xBF]"

                + @"|\xE0[\xA0-\xBF][\x80-\xBF]"

                + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"

                + @"|\xED[\x80-\x9F][\x80-\xBF]"

                + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"

                + @"|[\xF1-\xF3][\x80-\xBF]{3}"

                + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"

                + @")*\z");

            if (UTF8Validator.IsMatch(potentiallyMangledString))

            {

                //Unfortunately, just the fact that it CAN be UTF-8 doesn&#39;t tell you much about probabilities.

                

 //If all the characters are in the 0-127 range, no harm done, most

western charsets are same as UTF-8 in these ranges.

                //If some of the characters were in the upper range (western accented

characters), however, they would likely be mangled to 2-byte by the

UTF-8 encoding process.

                // So, we need to play stats.

  

                // The "Random" likelihood of any pair of randomly generated characters being one

                // of these "suspicious" character sequences is:

                // 128 / (256 * 256) = 0.2%.

                //

                // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127

                // character range, so we assume that more than 1 in 500,000 of these character

                // sequences indicates UTF-8. The number 500,000 is completely arbitrary - so sue me.

                //

                // We can only assume these character sequences will be rare if we ALSO assume that this

                // IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is

                // not already suspicious sequences) should be plain US-ASCII bytes. This, I

                // arbitrarily decided, should be 80% (a random distribution, eg binary data, would yield

                // approx 40%, so the chances of hitting this threshold by accident in random data are

                // VERY low).

  

                if ((suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1) //suspicious sequences

                    && (

                           //all suspicious, so cannot evaluate proportion of US-Ascii

                           SampleBytes.Length - suspiciousUTF8BytesTotal == 0

                           ||

                           likelyUSASCIIBytesInSample * 1.0 / (SampleBytes.Length - suspiciousUTF8BytesTotal) >= 0.8

                       )

                    )

                    return Encoding.UTF8;

            }

  

            return null;

        }

  

        private static bool IsCommonUSASCIIByte(byte testByte)

        {

            if (testByte == 0x0A //lf

                || testByte == 0x0D //cr

                || testByte == 0x09 //tab

                || (testByte >= 0x20 && testByte <= 0x2F) //common punctuation

                || (testByte >= 0x30 && testByte <= 0x39) //digits

                || (testByte >= 0x3A && testByte <= 0x40) //common punctuation

                || (testByte >= 0x41 && testByte <= 0x5A) //capital letters

                || (testByte >= 0x5B && testByte <= 0x60) //common punctuation

                || (testByte >= 0x61 && testByte <= 0x7A) //lowercase letters

                || (testByte >= 0x7B && testByte <= 0x7E) //common punctuation

                )

                return true;

            else

                return false;

        }

  

        private static int DetectSuspiciousUTF8SequenceLength(byte[] SampleBytes, long currentPos)

        {

            int lengthFound = 0;

  

            if (SampleBytes.Length >= currentPos + 1

                && SampleBytes[currentPos] == 0xC2

                )

            {

                if (SampleBytes[currentPos + 1] == 0x81

                    || SampleBytes[currentPos + 1] == 0x8D

                    || SampleBytes[currentPos + 1] == 0x8F

                    )

                    lengthFound = 2;

                else if (SampleBytes[currentPos + 1] == 0x90

                    || SampleBytes[currentPos + 1] == 0x9D

                    )

                    lengthFound = 2;

                else if (SampleBytes[currentPos + 1] >= 0xA0

                    && SampleBytes[currentPos + 1] <= 0xBF

                    )

                    lengthFound = 2;

            }

            else if (SampleBytes.Length >= currentPos + 1

                && SampleBytes[currentPos] == 0xC3

                )

            {

                if (SampleBytes[currentPos + 1] >= 0x80

                    && SampleBytes[currentPos + 1] <= 0xBF

                    )

                    lengthFound = 2;

            }

            else if (SampleBytes.Length >= currentPos + 1

                && SampleBytes[currentPos] == 0xC5

                )

            {

                if (SampleBytes[currentPos + 1] == 0x92

                    || SampleBytes[currentPos + 1] == 0x93

                    )

                    lengthFound = 2;

                else if (SampleBytes[currentPos + 1] == 0xA0

                    || SampleBytes[currentPos + 1] == 0xA1

                    )

                    lengthFound = 2;

                else if (SampleBytes[currentPos + 1] == 0xB8

                    || SampleBytes[currentPos + 1] == 0xBD

                    || SampleBytes[currentPos + 1] == 0xBE

                    )

                    lengthFound = 2;

            }

            else if (SampleBytes.Length >= currentPos + 1

                && SampleBytes[currentPos] == 0xC6

                )

            {

                if (SampleBytes[currentPos + 1] == 0x92)

                    lengthFound = 2;

            }

            else if (SampleBytes.Length >= currentPos + 1

                && SampleBytes[currentPos] == 0xCB

                )

            {

                if (SampleBytes[currentPos + 1] == 0x86

                    || SampleBytes[currentPos + 1] == 0x9C

                    )

                    lengthFound = 2;

            }

            else if (SampleBytes.Length >= currentPos + 2

                && SampleBytes[currentPos] == 0xE2

                )

            {

                if (SampleBytes[currentPos + 1] == 0x80)

                {

                    if (SampleBytes[currentPos + 2] == 0x93

                        || SampleBytes[currentPos + 2] == 0x94

                        )

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0x98

                        || SampleBytes[currentPos + 2] == 0x99

                        || SampleBytes[currentPos + 2] == 0x9A

                        )

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0x9C

                        || SampleBytes[currentPos + 2] == 0x9D

                        || SampleBytes[currentPos + 2] == 0x9E

                        )

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0xA0

                        || SampleBytes[currentPos + 2] == 0xA1

                        || SampleBytes[currentPos + 2] == 0xA2

                        )

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0xA6)

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0xB0)

                        lengthFound = 3;

                    if (SampleBytes[currentPos + 2] == 0xB9

                        || SampleBytes[currentPos + 2] == 0xBA

                        )

                        lengthFound = 3;

                }

                else if (SampleBytes[currentPos + 1] == 0x82

                    && SampleBytes[currentPos + 2] == 0xAC

                    )

                    lengthFound = 3;

                else if (SampleBytes[currentPos + 1] == 0x84

                    && SampleBytes[currentPos + 2] == 0xA2

                    )

                    lengthFound = 3;

            }

  

            return lengthFound;

        }

  

    }

}

登入後複製

使用方法:

1

// Detect the encoding of a file; Encoding.Default is returned when neither a BOM
// nor the heuristics identify a Unicode encoding.
Encoding fileEncoding = TextFileEncodingDetector.DetectTextFileEncoding("your file path", Encoding.Default);

登入後複製


本網站聲明
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn

熱AI工具

Undresser.AI Undress

Undresser.AI Undress

人工智慧驅動的應用程序,用於創建逼真的裸體照片

AI Clothes Remover

AI Clothes Remover

用於從照片中去除衣服的線上人工智慧工具。

Undress AI Tool

Undress AI Tool

免費脫衣圖片

Clothoff.io

Clothoff.io

AI脫衣器

Video Face Swap

Video Face Swap

使用我們完全免費的人工智慧換臉工具,輕鬆在任何影片中換臉!

熱門文章

<🎜>:泡泡膠模擬器無窮大 - 如何獲取和使用皇家鑰匙
4 週前 By 尊渡假赌尊渡假赌尊渡假赌
北端:融合系統,解釋
4 週前 By 尊渡假赌尊渡假赌尊渡假赌
Mandragora:巫婆樹的耳語 - 如何解鎖抓鉤
3 週前 By 尊渡假赌尊渡假赌尊渡假赌

熱工具

記事本++7.3.1

記事本++7.3.1

好用且免費的程式碼編輯器

SublimeText3漢化版

SublimeText3漢化版

中文版,非常好用

禪工作室 13.0.1

禪工作室 13.0.1

強大的PHP整合開發環境

Dreamweaver CS6

Dreamweaver CS6

視覺化網頁開發工具

SublimeText3 Mac版

SublimeText3 Mac版

神級程式碼編輯軟體(SublimeText3)

熱門話題

Java教學
1672
14
CakePHP 教程
1428
52
Laravel 教程
1332
25
PHP教程
1276
29
C# 教程
1256
24