A Simple Jsoup Crawler Application

Fetching the page source with HttpClient:

package com.http.client;

import java.io.IOException;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;

/**
 * @author oo
 * @date 2018-04-04
 */
public class MyHttpClient {

    private static Logger logger = Logger.getLogger(MyHttpClient.class);

    /**
     * Requirement: use HttpClient to crawl website data
     *
     * @param args
     */
    public static void main(String[] args) {
        // Create the HttpClient object
        HttpClient hclient = new DefaultHttpClient();
        // Set the connection timeout, the socket (read) timeout, and a proxy server
        // (the proxy is used so the crawler's IP does not get banned)
        hclient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000)
                .setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000)
                .setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("111.155.116.237", 8123));

        HttpGet hGet = new HttpGet("http://www.swordsign.com/");
        String content = "";
        try {
            // Send the request to the site and fetch the page source
            HttpResponse execute = hclient.execute(hGet);
            // EntityUtils converts the response entity into a string
            content = EntityUtils.toString(execute.getEntity(), "utf-8");
        } catch (ClientProtocolException e) {
            e.printStackTrace();
            logger.error("ClientProtocolException" + e);
        } catch (IOException e) {
            e.printStackTrace();
            logger.error("IOException" + e);
        }

        System.out.println(content);
    }
}
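Note that DefaultHttpClient and the CoreConnectionPNames/ConnRouteParams parameter style are deprecated in HttpClient 4.3 and later. As a minimal sketch of the same fetch with the newer builder API (reusing the proxy host and URL from the example above; the class name MyHttpClient43 is just illustrative):

package com.http.client;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class MyHttpClient43 {

    public static void main(String[] args) throws Exception {
        // Same timeouts and proxy as above, expressed as a RequestConfig
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(20000)
                .setSocketTimeout(20000)
                .setProxy(new HttpHost("111.155.116.237", 8123))
                .build();

        // CloseableHttpClient replaces the deprecated DefaultHttpClient
        try (CloseableHttpClient client = HttpClients.custom()
                .setDefaultRequestConfig(config)
                .build()) {
            HttpGet get = new HttpGet("http://www.swordsign.com/");
            try (CloseableHttpResponse response = client.execute(get)) {
                String content = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(content);
            }
        }
    }
}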


Making the request with Jsoup:

package com.http.client;

import java.io.IOException;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class MyJsoup {

    private static Logger logger = Logger.getLogger(MyJsoup.class);

    public static void main(String[] args) {
        try {
            // Send the request with Jsoup and get the parsed document
            Document document = Jsoup.connect("http://www.swordsign.com/").get();
            // System.out.println(document);
            Elements elements = document.getElementsByTag("a");
            String val = elements.text();
            System.out.println(val);
            for (Element element : elements) {
                System.out.println(element.text() + ":" + element.attr("href"));
            }
        } catch (IOException e) {
            e.printStackTrace();
            logger.error("IOException: connection failed" + e);
        }
    }
}
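Jsoup.connect() can also carry request settings directly. Here is a small sketch (the user-agent string and timeout value are illustrative choices, not from the original example) that sets them and prints absolute link URLs via the abs:href attribute:

package com.http.client;

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class MyJsoupWithOptions {

    public static void main(String[] args) throws IOException {
        // Configure the request before sending it: user agent and timeout
        Document document = Jsoup.connect("http://www.swordsign.com/")
                .userAgent("Mozilla/5.0")   // illustrative value
                .timeout(20000)             // milliseconds
                .get();

        // select() takes CSS selectors; "a[href]" matches only anchors that have an href
        for (Element link : document.select("a[href]")) {
            // abs:href resolves the link against the page's base URL
            System.out.println(link.text() + ":" + link.attr("abs:href"));
        }
    }
}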

Combining HttpClient with Jsoup:

package com.http.client;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class HttpCLientAndJsoup {

    public static void main(String[] args) throws ClientProtocolException, IOException {
        // Create the HttpClient object
        HttpClient hClient = new DefaultHttpClient();
        // Most crawler URLs are GET requests; create the GET request object
        HttpGet hget = new HttpGet("http://www.swordsign.com/");
        // Send the request to the site and fetch the page source
        HttpResponse response = hClient.execute(hget);
        // EntityUtils converts the response entity into a string
        String content = EntityUtils.toString(response.getEntity(), "utf-8");
        // Jsoup parses the page source
        Document doc = Jsoup.parse(content);
        // Use an element selector to pick out the page content
        Elements elements = doc.select("div.salary_con li");
        // System.out.println(elements.text());
        for (Element element : elements) {
            String text = element.text();
            System.out.println(text);
        }
    }
}
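The selector "div.salary_con li" only matches pages that actually contain that markup. As a self-contained illustration of how Jsoup.parse plus select() behave, here is a sketch run against a hard-coded HTML snippet (the snippet and its contents are made up for the example):

package com.http.client;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupSelectDemo {

    public static void main(String[] args) {
        // A made-up snippet with the structure the selector expects
        String html = "<div class=\"salary_con\"><ul>"
                + "<li>Java Developer: 15k</li>"
                + "<li>Front End Developer: 12k</li>"
                + "</ul></div>";

        // Parse the string exactly like the crawled page source
        Document doc = Jsoup.parse(html);

        // "div.salary_con li" = every <li> inside a div with class salary_con
        for (Element element : doc.select("div.salary_con li")) {
            System.out.println(element.text());
        }
        // Prints:
        // Java Developer: 15k
        // Front End Developer: 12k
    }
}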

 

