C# 将 HTML 转成纯文本
/// <summary> /// Converts HTML to plain text. /// </summary> class HtmlToText { // Static data tables protected static Dictionary<string, string> _tags; protected static HashSet<string> _ignoreTags; // Instance variables protected TextBuilder _text; protected string _html; protected int _pos; // Static constructor (one time only) static HtmlToText() { _tags = new Dictionary<string, string>(); _tags.Add("address", "\n"); _tags.Add("blockquote", "\n"); _tags.Add("div", "\n"); _tags.Add("dl", "\n"); _tags.Add("fieldset", "\n"); _tags.Add("form", "\n"); _tags.Add("h1", "\n"); _tags.Add("/h1", "\n"); _tags.Add("h2", "\n"); _tags.Add("/h2", "\n"); _tags.Add("h3", "\n"); _tags.Add("/h3", "\n"); _tags.Add("h4", "\n"); _tags.Add("/h4", "\n"); _tags.Add("h5", "\n"); _tags.Add("/h5", "\n"); _tags.Add("h6", "\n"); _tags.Add("/h6", "\n"); _tags.Add("p", "\n"); _tags.Add("/p", "\n"); _tags.Add("table", "\n"); _tags.Add("/table", "\n"); _tags.Add("ul", "\n"); _tags.Add("/ul", "\n"); _tags.Add("ol", "\n"); _tags.Add("/ol", "\n"); _tags.Add("/li", "\n"); _tags.Add("br", "\n"); _tags.Add("/td", "\t"); _tags.Add("/tr", "\n"); _tags.Add("/pre", "\n"); _ignoreTags = new HashSet<string>(); _ignoreTags.Add("script"); _ignoreTags.Add("noscript"); _ignoreTags.Add("style"); _ignoreTags.Add("object"); } /// <summary> /// Converts the given HTML to plain text and returns the result. /// </summary> /// <param name="html">HTML to be converted</param> /// <returns>Resulting plain text</returns> public string Convert(string html) { // Initialize state variables _text = new TextBuilder(); _html = html; _pos = 0; // Process input while (!EndOfText) { if (Peek() == '<') { // HTML tag bool selfClosing; string tag = ParseTag(out selfClosing); // Handle special tag cases if (tag == "body") { // Discard content before <body> _text.Clear(); } else if (tag == "/body") { // Discard content after </body> _pos = _html.Length; } else if (tag == "pre") { // Enter preformatted mode _text.Preformatted = true; EatWhitespaceToNextLine(); } else if (tag == "/pre") { // Exit preformatted mode _text.Preformatted = false; } string value; if (_tags.TryGetValue(tag, out value)) _text.Write(value); if (_ignoreTags.Contains(tag)) EatInnerContent(tag); } else if (Char.IsWhiteSpace(Peek())) { // Whitespace (treat all as space) _text.Write(_text.Preformatted ? Peek() : ' '); MoveAhead(); } else { // Other text _text.Write(Peek()); MoveAhead(); } } // Return result return HttpUtility.HtmlDecode(_text.ToString()); } // Eats all characters that are part of the current tag // and returns information about that tag protected string ParseTag(out bool selfClosing) { string tag = String.Empty; selfClosing = false; if (Peek() == '<') { MoveAhead(); // Parse tag name EatWhitespace(); int start = _pos; if (Peek() == '/') MoveAhead(); while (!EndOfText && !Char.IsWhiteSpace(Peek()) && Peek() != '/' && Peek() != '>') MoveAhead(); tag = _html.Substring(start, _pos - start).ToLower(); // Parse rest of tag while (!EndOfText && Peek() != '>') { if (Peek() == '"' || Peek() == '\'') EatQuotedValue(); else { if (Peek() == '/') selfClosing = true; MoveAhead(); } } MoveAhead(); } return tag; } // Consumes inner content from the current tag protected void EatInnerContent(string tag) { string endTag = "/" + tag; while (!EndOfText) { if (Peek() == '<') { // Consume a tag bool selfClosing; if (ParseTag(out selfClosing) == endTag) return; // Use recursion to consume nested tags if (!selfClosing && !tag.StartsWith("/")) EatInnerContent(tag); } else MoveAhead(); } } // Returns true if the current position is at the end of // the string protected bool EndOfText { get { return (_pos >= _html.Length); } } // Safely returns the character at the current position protected char Peek() { return (_pos < _html.Length) ? _html[_pos] : (char)0; } // Safely advances to current position to the next character protected void MoveAhead() { _pos = Math.Min(_pos + 1, _html.Length); } // Moves the current position to the next non-whitespace // character. protected void EatWhitespace() { while (Char.IsWhiteSpace(Peek())) MoveAhead(); } // Moves the current position to the next non-whitespace // character or the start of the next line, whichever // comes first protected void EatWhitespaceToNextLine() { while (Char.IsWhiteSpace(Peek())) { char c = Peek(); MoveAhead(); if (c == '\n') break; } } // Moves the current position past a quoted value protected void EatQuotedValue() { char c = Peek(); if (c == '"' || c == '\'') { // Opening quote MoveAhead(); // Find end of value int start = _pos; _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos); if (_pos < 0) _pos = _html.Length; else MoveAhead(); // Closing quote } } /// <summary> /// A StringBuilder class that helps eliminate excess whitespace. /// </summary> protected class TextBuilder { private StringBuilder _text; private StringBuilder _currLine; private int _emptyLines; private bool _preformatted; // Construction public TextBuilder() { _text = new StringBuilder(); _currLine = new StringBuilder(); _emptyLines = 0; _preformatted = false; } /// <summary> /// Normally, extra whitespace characters are discarded. /// If this property is set to true, they are passed /// through unchanged. /// </summary> public bool Preformatted { get { return _preformatted; } set { if (value) { // Clear line buffer if changing to // preformatted mode if (_currLine.Length > 0) FlushCurrLine(); _emptyLines = 0; } _preformatted = value; } } /// <summary> /// Clears all current text. /// </summary> public void Clear() { _text.Length = 0; _currLine.Length = 0; _emptyLines = 0; } /// <summary> /// Writes the given string to the output buffer. /// </summary> /// <param name="s"></param> public void Write(string s) { foreach (char c in s) Write(c); } /// <summary> /// Writes the given character to the output buffer. /// </summary> /// <param name="c">Character to write</param> public void Write(char c) { if (_preformatted) { // Write preformatted character _text.Append(c); } else { if (c == '\r') { // Ignore carriage returns. We'll process // '\n' if it comes next } else if (c == '\n') { // Flush current line FlushCurrLine(); } else if (Char.IsWhiteSpace(c)) { // Write single space character int len = _currLine.Length; if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1])) _currLine.Append(' '); } else { // Add character to current line _currLine.Append(c); } } } // Appends the current line to output buffer protected void FlushCurrLine() { // Get current line string line = _currLine.ToString().Trim(); // Determine if line contains non-space characters string tmp = line.Replace(" ", String.Empty); if (tmp.Length == 0) { // An empty line _emptyLines++; if (_emptyLines < 2 && _text.Length > 0) _text.AppendLine(line); } else { // A non-empty line _emptyLines = 0; _text.AppendLine(line); } // Reset current line _currLine.Length = 0; } /// <summary> /// Returns the current output as a string. /// </summary> public override string ToString() { if (_currLine.Length > 0) FlushCurrLine(); return _text.ToString(); } } }

热AI工具

Undresser.AI Undress
人工智能驱动的应用程序,用于创建逼真的裸体照片

AI Clothes Remover
用于从照片中去除衣服的在线人工智能工具。

Undress AI Tool
免费脱衣服图片

Clothoff.io
AI脱衣机

AI Hentai Generator
免费生成ai无尽的。

热门文章

热工具

记事本++7.3.1
好用且免费的代码编辑器

SublimeText3汉化版
中文版,非常好用

禅工作室 13.0.1
功能强大的PHP集成开发环境

Dreamweaver CS6
视觉化网页开发工具

SublimeText3 Mac版
神级代码编辑软件(SublimeText3)

热门话题

C语言中通过转义序列处理特殊字符,如:\n表示换行符。\t表示制表符。使用转义序列或字符常量表示特殊字符,如char c = '\n'。注意,反斜杠需要转义两次。不同平台和编译器可能有不同的转义序列,请查阅文档。

在 C 语言中,char 类型在字符串中用于:1. 存储单个字符;2. 使用数组表示字符串并以 null 终止符结束;3. 通过字符串操作函数进行操作;4. 从键盘读取或输出字符串。

C 语言中符号的使用方法涵盖算术、赋值、条件、逻辑、位运算符等。算术运算符用于基本数学运算,赋值运算符用于赋值和加减乘除赋值,条件运算符用于根据条件执行不同操作,逻辑运算符用于逻辑操作,位运算符用于位级操作,特殊常量用于表示空指针、文件结束标记和非数字值。

在 C 语言中,char 和 wchar_t 的主要区别在于字符编码:char 使用 ASCII 或扩展 ASCII,wchar_t 使用 Unicode;char 占用 1-2 个字节,wchar_t 占用 2-4 个字节;char 适用于英语文本,wchar_t 适用于多语言文本;char 广泛支持,wchar_t 依赖于编译器和操作系统是否支持 Unicode;char 的字符范围受限,wchar_t 的字符范围更大,并使用专门的函数进行算术运算。

多线程和异步的区别在于,多线程同时执行多个线程,而异步在不阻塞当前线程的情况下执行操作。多线程用于计算密集型任务,而异步用于用户交互操作。多线程的优势是提高计算性能,异步的优势是不阻塞 UI 线程。选择多线程还是异步取决于任务性质:计算密集型任务使用多线程,与外部资源交互且需要保持 UI 响应的任务使用异步。

在 C 语言中,char 类型转换可以通过:强制类型转换:使用强制类型转换符将一种类型的数据直接转换为另一种类型。自动类型转换:当一种类型的数据可以容纳另一种类型的值时,编译器自动进行转换。

char 数组在 C 语言中存储字符序列,声明为 char array_name[size]。访问元素通过下标运算符,元素以空终止符 '\0' 结尾,用于表示字符串终点。C 语言提供多种字符串操作函数,如 strlen()、strcpy()、strcat() 和 strcmp()。

C语言中没有内置求和函数,需自行编写。可通过遍历数组并累加元素实现求和:循环版本:使用for循环和数组长度计算求和。指针版本:使用指针指向数组元素,通过自增指针遍历高效求和。动态分配数组版本:动态分配数组并自行管理内存,确保释放已分配内存以防止内存泄漏。
