Home Backend Development C#.Net Tutorial C# Convert HTML to plain text

C# Convert HTML to plain text

Nov 10, 2016 am 10:15 AM

/// <summary>
/// Converts HTML to plain text.
/// </summary>
class HtmlToText
{
    // Static data tables
    protected static Dictionary<string, string> _tags;
    protected static HashSet<string> _ignoreTags;
 
    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;
 
    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>();
        _tags.Add("address", "\n");
        _tags.Add("blockquote", "\n");
        _tags.Add("div", "\n");
        _tags.Add("dl", "\n");
        _tags.Add("fieldset", "\n");
        _tags.Add("form", "\n");
        _tags.Add("h1", "\n");
        _tags.Add("/h1", "\n");
        _tags.Add("h2", "\n");
        _tags.Add("/h2", "\n");
        _tags.Add("h3", "\n");
        _tags.Add("/h3", "\n");
        _tags.Add("h4", "\n");
        _tags.Add("/h4", "\n");
        _tags.Add("h5", "\n");
        _tags.Add("/h5", "\n");
        _tags.Add("h6", "\n");
        _tags.Add("/h6", "\n");
        _tags.Add("p", "\n");
        _tags.Add("/p", "\n");
        _tags.Add("table", "\n");
        _tags.Add("/table", "\n");
        _tags.Add("ul", "\n");
        _tags.Add("/ul", "\n");
        _tags.Add("ol", "\n");
        _tags.Add("/ol", "\n");
        _tags.Add("/li", "\n");
        _tags.Add("br", "\n");
        _tags.Add("/td", "\t");
        _tags.Add("/tr", "\n");
        _tags.Add("/pre", "\n");
 
        _ignoreTags = new HashSet<string>();
        _ignoreTags.Add("script");
        _ignoreTags.Add("noscript");
        _ignoreTags.Add("style");
        _ignoreTags.Add("object");
    }
 
    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted</param>
    /// <returns>Resulting plain text</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html;
        _pos = 0;
 
        // Process input
        while (!EndOfText)
        {
            if (Peek() == &#39;<&#39;)
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);
 
                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }
 
                string value;
                if (_tags.TryGetValue(tag, out value))
                    _text.Write(value);
 
                if (_ignoreTags.Contains(tag))
                    EatInnerContent(tag);
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space)
                _text.Write(_text.Preformatted ? Peek() : &#39; &#39;);
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }
        // Return result
        return HttpUtility.HtmlDecode(_text.ToString());
    }
 
    // Eats all characters that are part of the current tag
    // and returns information about that tag
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;
 
        if (Peek() == &#39;<&#39;)
        {
            MoveAhead();
 
            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == &#39;/&#39;)
                MoveAhead();
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != &#39;/&#39; && Peek() != &#39;>&#39;)
                MoveAhead();
            tag = _html.Substring(start, _pos - start).ToLower();
 
            // Parse rest of tag
            while (!EndOfText && Peek() != &#39;>&#39;)
            {
                if (Peek() == &#39;"&#39; || Peek() == &#39;\&#39;&#39;)
                    EatQuotedValue();
                else
                {
                    if (Peek() == &#39;/&#39;)
                        selfClosing = true;
                    MoveAhead();
                }
            }
            MoveAhead();
        }
        return tag;
    }
 
    // Consumes inner content from the current tag
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;
 
        while (!EndOfText)
        {
            if (Peek() == &#39;<&#39;)
            {
                // Consume a tag
                bool selfClosing;
                if (ParseTag(out selfClosing) == endTag)
                    return;
                // Use recursion to consume nested tags
                if (!selfClosing && !tag.StartsWith("/"))
                    EatInnerContent(tag);
            }
            else MoveAhead();
        }
    }
 
    // Returns true if the current position is at the end of
    // the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }
 
    // Safely returns the character at the current position
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }
 
    // Safely advances to current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }
 
    // Moves the current position to the next non-whitespace
    // character.
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
            MoveAhead();
    }
 
    // Moves the current position to the next non-whitespace
    // character or the start of the next line, whichever
    // comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == &#39;\n&#39;)
                break;
        }
    }
 
    // Moves the current position past a quoted value
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == &#39;"&#39; || c == &#39;\&#39;&#39;)
        {
            // Opening quote
            MoveAhead();
            // Find end of value
            int start = _pos;
            _pos = _html.IndexOfAny(new char[] { c, &#39;\r&#39;, &#39;\n&#39; }, _pos);
            if (_pos < 0)
                _pos = _html.Length;
            else
                MoveAhead();    // Closing quote
        }
    }
 
    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;
 
        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }
 
        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                        FlushCurrLine();
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }
 
        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }
 
        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s"></param>
        public void Write(string s)
        {
            foreach (char c in s)
                Write(c);
        }
 
        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == &#39;\r&#39;)
                {
                    // Ignore carriage returns. We&#39;ll process
                    // &#39;\n&#39; if it comes next
                }
                else if (c == &#39;\n&#39;)
                {
                    // Flush current line
                    FlushCurrLine();
                }
                else if (Char.IsWhiteSpace(c))
                {
                    // Write single space character
                    int len = _currLine.Length;
                    if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                        _currLine.Append(&#39; &#39;);
                }
                else
                {
                    // Add character to current line
                    _currLine.Append(c);
                }
            }
        }
 
        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();
 
            // Determine if line contains non-space characters
            string tmp = line.Replace(" ", String.Empty);
            if (tmp.Length == 0)
            {
                // An empty line
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                    _text.AppendLine(line);
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }
 
            // Reset current line
            _currLine.Length = 0;
        }
 
        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
                FlushCurrLine();
            return _text.ToString();
        }
    }
}
Copy after login

Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn

Hot AI Tools

Undresser.AI Undress

Undresser.AI Undress

AI-powered app for creating realistic nude photos

AI Clothes Remover

AI Clothes Remover

Online AI tool for removing clothes from photos.

Undress AI Tool

Undress AI Tool

Undress images for free

Clothoff.io

Clothoff.io

AI clothes remover

AI Hentai Generator

AI Hentai Generator

Generate AI Hentai for free.

Hot Tools

Notepad++7.3.1

Notepad++7.3.1

Easy-to-use and free code editor

SublimeText3 Chinese version

SublimeText3 Chinese version

Chinese version, very easy to use

Zend Studio 13.0.1

Zend Studio 13.0.1

Powerful PHP integrated development environment

Dreamweaver CS6

Dreamweaver CS6

Visual web development tools

SublimeText3 Mac version

SublimeText3 Mac version

God-level code editing software (SublimeText3)

How to handle special characters in C language How to handle special characters in C language Apr 03, 2025 pm 03:18 PM

In C language, special characters are processed through escape sequences, such as: \n represents line breaks. \t means tab character. Use escape sequences or character constants to represent special characters, such as char c = '\n'. Note that the backslash needs to be escaped twice. Different platforms and compilers may have different escape sequences, please consult the documentation.

What is the role of char in C strings What is the role of char in C strings Apr 03, 2025 pm 03:15 PM

In C, the char type is used in strings: 1. Store a single character; 2. Use an array to represent a string and end with a null terminator; 3. Operate through a string operation function; 4. Read or output a string from the keyboard.

The difference between char and wchar_t in C language The difference between char and wchar_t in C language Apr 03, 2025 pm 03:09 PM

In C language, the main difference between char and wchar_t is character encoding: char uses ASCII or extends ASCII, wchar_t uses Unicode; char takes up 1-2 bytes, wchar_t takes up 2-4 bytes; char is suitable for English text, wchar_t is suitable for multilingual text; char is widely supported, wchar_t depends on whether the compiler and operating system support Unicode; char is limited in character range, wchar_t has a larger character range, and special functions are used for arithmetic operations.

How to use various symbols in C language How to use various symbols in C language Apr 03, 2025 pm 04:48 PM

The usage methods of symbols in C language cover arithmetic, assignment, conditions, logic, bit operators, etc. Arithmetic operators are used for basic mathematical operations, assignment operators are used for assignment and addition, subtraction, multiplication and division assignment, condition operators are used for different operations according to conditions, logical operators are used for logical operations, bit operators are used for bit-level operations, and special constants are used to represent null pointers, end-of-file markers, and non-numeric values.

The difference between multithreading and asynchronous c# The difference between multithreading and asynchronous c# Apr 03, 2025 pm 02:57 PM

The difference between multithreading and asynchronous is that multithreading executes multiple threads at the same time, while asynchronously performs operations without blocking the current thread. Multithreading is used for compute-intensive tasks, while asynchronously is used for user interaction. The advantage of multi-threading is to improve computing performance, while the advantage of asynchronous is to not block UI threads. Choosing multithreading or asynchronous depends on the nature of the task: Computation-intensive tasks use multithreading, tasks that interact with external resources and need to keep UI responsiveness use asynchronous.

How to convert char in C language How to convert char in C language Apr 03, 2025 pm 03:21 PM

In C language, char type conversion can be directly converted to another type by: casting: using casting characters. Automatic type conversion: When one type of data can accommodate another type of value, the compiler automatically converts it.

What is the function of C language sum? What is the function of C language sum? Apr 03, 2025 pm 02:21 PM

There is no built-in sum function in C language, so it needs to be written by yourself. Sum can be achieved by traversing the array and accumulating elements: Loop version: Sum is calculated using for loop and array length. Pointer version: Use pointers to point to array elements, and efficient summing is achieved through self-increment pointers. Dynamically allocate array version: Dynamically allocate arrays and manage memory yourself, ensuring that allocated memory is freed to prevent memory leaks.

How to use char array in C language How to use char array in C language Apr 03, 2025 pm 03:24 PM

The char array stores character sequences in C language and is declared as char array_name[size]. The access element is passed through the subscript operator, and the element ends with the null terminator '\0', which represents the end point of the string. The C language provides a variety of string manipulation functions, such as strlen(), strcpy(), strcat() and strcmp().

See all articles