package test;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads the content behind a URL and can reduce it to the first sentence of its
 * visible text.
 *
 * <p>The URL and the character encoding used to decode the response are plain mutable
 * properties; the encoding defaults to {@code "UTF8"}. Every call to
 * {@link #getPageSource()} or {@link #getPageSourceWithoutHtml()} opens the URL again;
 * nothing is cached.
 */
public class Webpage {

    /** Matches a complete {@code <script>} element, including its body. */
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /** Matches a complete {@code <style>} element, including its body. */
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /** Matches any single remaining tag such as {@code <p>} or {@code </div>}. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]+>");

    /**
     * Matches everything treated as blank: regex whitespace, the non-breaking space
     * character (U+00A0, which {@code \s} does not cover) and the {@code &nbsp;} entity
     * that is left behind as literal text once the tags are gone.
     */
    private static final Pattern BLANK_PATTERN = Pattern.compile("[\\s\\u00A0]+|&nbsp;");

    /** Ideographic full stop (U+3002); marks the end of the first sentence. */
    private static final String SENTENCE_END = "\u3002";

    private String pageUrl;
    private String pageEncode = "UTF8";

    /**
     * @return the URL that will be fetched, or {@code null} if none has been set
     */
    public String getPageUrl() {
        return pageUrl;
    }

    /**
     * @param pageUrl the URL to fetch; it is not validated until the page is read
     */
    public void setPageUrl(String pageUrl) {
        this.pageUrl = pageUrl;
    }

    /**
     * @return the charset name used to decode the page ({@code "UTF8"} unless changed)
     */
    public String getPageEncode() {
        return pageEncode;
    }

    /**
     * @param pageEncode the charset name used to decode the page; it is not validated
     *                   until the page is read
     */
    public void setPageEncode(String pageEncode) {
        this.pageEncode = pageEncode;
    }

    /**
     * Reads the whole resource behind {@link #getPageUrl()} as text.
     *
     * <p>Lines are concatenated without any separator, because
     * {@link BufferedReader#readLine()} drops the line terminators and none are added
     * back.
     *
     * <p>This is a best-effort read: a malformed or missing URL, an unsupported or
     * missing encoding, or an I/O failure is reported on {@code System.err} and whatever
     * was read up to that point (possibly the empty string) is returned.
     *
     * @return the decoded content with line breaks removed; never {@code null}
     */
    public String getPageSource() {
        StringBuilder source = new StringBuilder();
        try {
            URL url = new URL(pageUrl);
            // The raw stream is its own resource so that it is still closed when the
            // reader cannot be constructed (e.g. unsupported encoding name).
            try (InputStream raw = url.openStream();
                 BufferedReader in = new BufferedReader(new InputStreamReader(raw, pageEncode))) {
                String line;
                while ((line = in.readLine()) != null) {
                    source.append(line);
                }
            }
        } catch (Exception ex) {
            // Deliberately broad: callers rely on getting a string back rather than an
            // exception, including for runtime failures such as a null encoding name.
            System.err.println(ex);
        }
        return source.toString();
    }

    /**
     * Fetches the page and returns the first sentence of its visible text.
     *
     * <p>Script and style elements are removed together with their bodies, all other
     * tags are stripped, and every blank (whitespace, non-breaking spaces and
     * {@code &nbsp;} entities) is deleted. The result is then cut after the first
     * ideographic full stop (U+3002). If the text contains no such full stop, the whole
     * stripped text is returned instead of being discarded.
     *
     * @return the first sentence of the page text, the whole stripped text when no
     *         sentence terminator is present, or the empty string when the page could
     *         not be read; never {@code null}
     */
    public String getPageSourceWithoutHtml() {
        return firstSentence(stripMarkup(getPageSource()));
    }

    /** Removes script/style elements, all remaining tags and every blank from the HTML. */
    private static String stripMarkup(String html) {
        String text = SCRIPT_PATTERN.matcher(html).replaceAll("");
        text = STYLE_PATTERN.matcher(text).replaceAll("");
        text = TAG_PATTERN.matcher(text).replaceAll("");
        text = BLANK_PATTERN.matcher(text).replaceAll("");
        // trim() additionally drops leading/trailing control characters that \s ignores.
        return text.trim();
    }

    /** Cuts the text after the first sentence terminator; returns it whole if there is none. */
    private static String firstSentence(String text) {
        int end = text.indexOf(SENTENCE_END);
        return end < 0 ? text : text.substring(0, end + 1);
    }
}