A Simple Jsoup Crawler Application

Fetching the page source with HttpClient:

package com.http.client;

import java.io.IOException;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;

/**
 * @author oo
 * @date 2018-04-04
 */
public class MyHttpClient {

    private static Logger logger = Logger.getLogger(MyHttpClient.class);

    /**
     * Requirement: use HttpClient to crawl website data
     *
     * @param args
     */
    public static void main(String[] args) {
        // Create the HttpClient object
        HttpClient hclient = new DefaultHttpClient();
        // Set the connection timeout, the socket (read) timeout, and a proxy server
        // (the proxy is used so the crawler's IP does not get banned)
        hclient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000)
                .setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000)
                .setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("111.155.116.237", 8123));

        HttpGet hGet = new HttpGet("http://www.swordsign.com/");
        String content = "";
        try {
            // Send the request to the site and fetch the page source
            HttpResponse execute = hclient.execute(hGet);
            // EntityUtils converts the response entity into a string
            content = EntityUtils.toString(execute.getEntity(), "utf-8");
        } catch (ClientProtocolException e) {
            e.printStackTrace();
            logger.error("ClientProtocolException" + e);
        } catch (IOException e) {
            e.printStackTrace();
            logger.error("IOException" + e);
        }

        System.out.println(content);
    }
}
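Note that DefaultHttpClient and the CoreConnectionPNames/ConnRouteParams parameter style are deprecated in HttpClient 4.3 and later. As a minimal sketch of the same fetch with the newer builder API (reusing the proxy host and URL from the example above; the class name MyHttpClient43 is just illustrative):

package com.http.client;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class MyHttpClient43 {

    public static void main(String[] args) throws Exception {
        // Same timeouts and proxy as above, expressed as a RequestConfig
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(20000)
                .setSocketTimeout(20000)
                .setProxy(new HttpHost("111.155.116.237", 8123))
                .build();

        // CloseableHttpClient replaces the deprecated DefaultHttpClient
        try (CloseableHttpClient client = HttpClients.custom()
                .setDefaultRequestConfig(config)
                .build()) {
            HttpGet get = new HttpGet("http://www.swordsign.com/");
            try (CloseableHttpResponse response = client.execute(get)) {
                String content = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(content);
            }
        }
    }
}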


Making the request with Jsoup:

package com.http.client;

import java.io.IOException;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class MyJsoup {

    private static Logger logger = Logger.getLogger(MyJsoup.class);

    public static void main(String[] args) {
        try {
            // Send the request with Jsoup and get the parsed document
            Document document = Jsoup.connect("http://www.swordsign.com/").get();
            // System.out.println(document);
            Elements elements = document.getElementsByTag("a");
            String val = elements.text();
            System.out.println(val);
            for (Element element : elements) {
                System.out.println(element.text() + ":" + element.attr("href"));
            }
        } catch (IOException e) {
            e.printStackTrace();
            logger.error("IOException: connection failed" + e);
        }
    }
}
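Jsoup.connect() can also carry request settings directly. Here is a small sketch (the user-agent string and timeout value are illustrative choices, not from the original example) that sets them and prints absolute link URLs via the abs:href attribute:

package com.http.client;

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class MyJsoupWithOptions {

    public static void main(String[] args) throws IOException {
        // Configure the request before sending it: user agent and timeout
        Document document = Jsoup.connect("http://www.swordsign.com/")
                .userAgent("Mozilla/5.0")   // illustrative value
                .timeout(20000)             // milliseconds
                .get();

        // select() takes CSS selectors; "a[href]" matches only anchors that have an href
        for (Element link : document.select("a[href]")) {
            // abs:href resolves the link against the page's base URL
            System.out.println(link.text() + ":" + link.attr("abs:href"));
        }
    }
}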

Combining HttpClient with Jsoup:

package com.http.client;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class HttpCLientAndJsoup {

    public static void main(String[] args) throws ClientProtocolException, IOException {
        // Create the HttpClient object
        HttpClient hClient = new DefaultHttpClient();
        // Most crawler URLs are GET requests; create the GET request object
        HttpGet hget = new HttpGet("http://www.swordsign.com/");
        // Send the request to the site and fetch the page source
        HttpResponse response = hClient.execute(hget);
        // EntityUtils converts the response entity into a string
        String content = EntityUtils.toString(response.getEntity(), "utf-8");
        // Jsoup parses the page source
        Document doc = Jsoup.parse(content);
        // Use an element selector to pick out the page content
        Elements elements = doc.select("div.salary_con li");
        // System.out.println(elements.text());
        for (Element element : elements) {
            String text = element.text();
            System.out.println(text);
        }
    }
}
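The selector "div.salary_con li" only matches pages that actually contain that markup. As a self-contained illustration of how Jsoup.parse plus select() behave, here is a sketch run against a hard-coded HTML snippet (the snippet and its contents are made up for the example):

package com.http.client;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupSelectDemo {

    public static void main(String[] args) {
        // A made-up snippet with the structure the selector expects
        String html = "<div class=\"salary_con\"><ul>"
                + "<li>Java Developer: 15k</li>"
                + "<li>Front End Developer: 12k</li>"
                + "</ul></div>";

        // Parse the string exactly like the crawled page source
        Document doc = Jsoup.parse(html);

        // "div.salary_con li" = every <li> inside a div with class salary_con
        for (Element element : doc.select("div.salary_con li")) {
            System.out.println(element.text());
        }
        // Prints:
        // Java Developer: 15k
        // Front End Developer: 12k
    }
}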

 

