How to extract the body of html and keep some content?
Text extraction is to remove the <> content in the html code. This code adds the option to retain certain <> content.
1 using System; 2 using System.Text; 3 namespace HtmlStrip 4 { 5 class MainClass 6 { 7 public static void Main (string[] args) 8 { 9 string str = "<p>abc</p><span>efg</span><br /><script>888</script><!--<PA>WW</PA-->oo"; 10 //System.IO.StreamReader rd=new System.IO.StreamReader ("/home/lx/test.html"); 11 //str=rd.ReadToEnd (); 12 HtmlParser t = new HtmlParser (str); // 13 t.KeepTag (new string[] { "br" }); //设置br标签不过虑 14 Console.Write (t.Text ()); 15 } 16 17 18 19 } 20 class HtmlParser 21 { 22 private string[] htmlcode; //把html转为数组形式用于分析 23 private StringBuilder result = new StringBuilder (); //输出的结果 24 private int seek; //分析文本时候的指针位置 25 private string[] keepTag; //用于保存要保留的尖括号内容 26 private bool _inTag; //标记现在的指针是不是在尖括号内 27 private bool needContent = true; //是否要提取正文 28 private string tagName; //当前尖括号的名字 29 private string[] specialTag = new string[] { "script", "style", "!--" }; //特殊的尖括号内容,一般这些标签的正文是不要的 30 31 /// <summary> 32 /// 当指针进入尖括号内,就会触发这个属性。这里主要逻辑是提取尖括号里的标签名字 33 /// </summary> 34 public bool inTag { 35 get { return _inTag; } 36 set { 37 _inTag = value; 38 if (!value) 39 return; 40 bool ok = true; 41 tagName = ""; 42 while (ok) { 43 string word = read (); 44 if (word != " " && word != ">") { 45 tagName += word; 46 } else if (word == " " && tagName.Length > 0) { 47 ok = false; 48 } else if (word == ">") { 49 ok = false; 50 inTag = false; 51 seek -= 1; 52 } 53 } 54 } 55 } 56 /// <summary> 57 /// 初始化类 58 /// </summary> 59 /// <param name="html"> 60 /// 要分析的html代码 61 /// </param> 62 public HtmlParser (string html) 63 { 64 htmlcode = new string[html.Length]; 65 for (int i = 0; i < html.Length; i++) { 66 htmlcode[i] = html[i].ToString (); 67 } 68 KeepTag (new string[] { }); 69 } 70 /// <summary> 71 /// 设置要保存那些标签不要被过滤掉 72 /// </summary> 73 /// <param name="tags"> 74 /// 75 /// </param> 76 public void KeepTag (string[] tags) 77 { 78 keepTag = tags; 79 } 80 81 /// <summary> 82 /// 83 /// </summary> 84 /// <returns> 85 /// 输出处理后的文本 86 /// </returns> 87 public string Text () 88 { 89 int startTag = 0; 90 int endTag = 0; 91 while (seek < htmlcode.Length) { 92 string word = read (); 93 if (word.ToLower () == "<") { 94 startTag = seek; 95 inTag = true; 96 } else if (word.ToLower () == ">") { 97 endTag = seek; 98 inTag = false; 99 if (iskeepTag (tagName.Replace ("/", ""))) { 100 for (int i = startTag - 1; i < endTag; i++) { 101 result.Append (htmlcode[i].ToString ()); 102 } 103 } else if (tagName.StartsWith ("!--")) { 104 bool ok = true; 105 while (ok) { 106 if (read () == "-") { 107 if (read () == "-") { 108 if (read () == ">") { 109 ok = false; 110 } else { 111 seek -= 1; 112 } 113 } 114 } 115 } 116 } else { 117 foreach (string str in specialTag) { 118 if (tagName == str) { 119 needContent = false; 120 break; 121 } else 122 needContent = true; 123 } 124 } 125 } else if (!inTag && needContent) { 126 result.Append (word); 127 } 128 129 } 130 return result.ToString (); 131 } 132 /// <summary> 133 /// 判断是否要保存这个标签 134 /// </summary> 135 /// <param name="tag"> 136 /// A <see cref="System.String"/> 137 /// </param> 138 /// <returns> 139 /// A <see cref="System.Boolean"/> 140 /// </returns> 141 private bool iskeepTag (string tag) 142 { 143 foreach (string ta in keepTag) { 144 if (tag.ToLower () == ta.ToLower ()) { 145 return true; 146 } 147 } 148 return false; 149 } 150 private string read () 151 { 152 return htmlcode[seek++]; 153 } 154 155 } 156 } 157
The above is the detailed content of How to extract the body of html and keep some content?. For more information, please follow other related articles on the PHP Chinese website!

Hot AI Tools

Undresser.AI Undress
AI-powered app for creating realistic nude photos

AI Clothes Remover
Online AI tool for removing clothes from photos.

Undress AI Tool
Undress images for free

Clothoff.io
AI clothes remover

Video Face Swap
Swap faces in any video effortlessly with our completely free AI face swap tool!

Hot Article

Hot Tools

Notepad++7.3.1
Easy-to-use and free code editor

SublimeText3 Chinese version
Chinese version, very easy to use

Zend Studio 13.0.1
Powerful PHP integrated development environment

Dreamweaver CS6
Visual web development tools

SublimeText3 Mac version
God-level code editing software (SublimeText3)

Hot Topics



Guide to Table Border in HTML. Here we discuss multiple ways for defining table-border with examples of the Table Border in HTML.

Guide to HTML margin-left. Here we discuss a brief overview on HTML margin-left and its Examples along with its Code Implementation.

This is a guide to Nested Table in HTML. Here we discuss how to create a table within the table along with the respective examples.

Guide to HTML Table Layout. Here we discuss the Values of HTML Table Layout along with the examples and outputs n detail.

Guide to HTML Input Placeholder. Here we discuss the Examples of HTML Input Placeholder along with the codes and outputs.

Guide to the HTML Ordered List. Here we also discuss introduction of HTML Ordered list and types along with their example respectively

Guide to Moving Text in HTML. Here we discuss an introduction, how marquee tag work with syntax and examples to implement.

Guide to HTML onclick Button. Here we discuss their introduction, working, examples and onclick Event in various events respectively.
