定义一个http抓包类,发送数据到一个自定义的接受脚本,可以发送成功,并收取数据,但是发送到外网,却不行,分析过在浏览器下发送HTTP请求时的request header 信息,通过模拟请求,但超时...
//定义一个HTTP抓包类,其实也可以用curl。。。。。<?php ini_set('error_reporting', E_ALL);class Httpwrap{ private $hostInfo=null; private $requestLine=null; private $requestHeader=null; private $emptyLine="\r\n"; private $requestBody=null; private $requestEntity=null; private $responseEntity=null; private $responseHeader=null; private $responseBody=null; private $emptyLinePos=null; private $connect=null; private $errNo=null; private $errStr=null; public function __construct($url) { $this->hostInfo=parse_url($url); $this->setRequestHeader(array('Host' => $this->hostInfo['host'])); $this->setRequestHeader(array('Connection' => 'keep-alive')); } //设置HTTP请求行信息,例如: GET /resources HTTP/1.1 //但为了避免漏掉url中?开始的查询信息,有必要进行判断 public function setRequestLine($method) { //如果是POST请求,则自动添加content-type头信息 if(strtolower($method)=='post') { $this->setRequestHeader(array('Content-type' => 'application/x-www-form-urlencoded')); } if(!empty($this->hostInfo['query'])) { $this->requestLine=strtoupper($method)." ".$this->hostInfo['path']."?".$this->hostInfo['query']." HTTP/1.1 \r\n"; } else { $this->requestLine=strtoupper($method)." ".$this->hostInfo['path']." HTTP/1.1 \r\n"; } } //设置HTTP请求头。 //接收参数是数组类型,通过迭代拼接key:value,并换行 public function setRequestHeader($header) { foreach($header as $key => $value) { $this->requestHeader .=$key.":".$value."\r\n"; } } //设置HTTP请求体 //接收参数是数组类型,通过迭代拼接key=value,因为最后一席拼接会有一个多余的&,所以有必要去掉 public function setRequestBody($body) { foreach($body as $key => $value) { $this->requestBody .=$key.'='.$value.'&'; } $offset=strrpos($this->requestBody, '&'); $this->requestBody=substr($this->requestBody, 0, $offset); } //组装 请求行+请求头+请求体,并根据请求体的长度,自动填充请求头的content-length字段 public function setRequestEntity() { if(!empty($this->requestBody)) { $contentLength=strlen($this->requestBody); $this->setRequestHeader(array('Content-length' => $contentLength)); $this->requestEntity=$this->requestLine.$this->requestHeader."\r\n".$this->requestBody; } else { $this->requestEntity=$this->requestLine.$this->requestHeader."\r\n"; } } //解析主机名的函数,暂时没有用上....... public function parseHost($url) { $pat='#http://([^/]+)#i'; if(preg_match($pat, $url, $match)) { return $match[1]; } else { echo '匹配主机信息失败<br />'; } } //创建到主机的连接 public function createConnect() { $this->connect=fsockopen($this->hostInfo['host'], 80, $this->errNo, $this->errStr) or die('连接主机失败'.$this->errStr); } //发送请求 public function sendRequest() { $this->setRequestEntity(); echo $this->requestEntity; exit(); $this->createConnect(); $entityLength=strlen($this->requestEntity); if($entityLength != fwrite($this->connect, $this->requestEntity, $entityLength)) { die('写入数据失败<br />'); } else { $this->receiveResponse(); } } //接受请求,并依次拼接响应体 public function receiveResponse() { while(!feof($this->connect)) { $this->responseEntity .= fread($this->connect, 1024); } } //计算响应头与响应体之间的空行的位置 public function calculateEmptyLinePos() { $this->emptyLinePos=strpos($this->responseEntity,"\r\n\r\n",0); } //接受响应体的头部.... public function receiveResponseHeader() { $this->calculateEmptyLinePos(); $this->responseHeader=substr($this->responseEntity, 0, $this->emptyLinePos); echo $this->responseHeader; } //接收响应体的body部分 public function receiveResponseBody() { $this->calculateEmptyLinePos(); $this->responseBody=substr($this->responseEntity, $this->emptyLinePos); } //返回请求结果 public function getResponse() { return $this->responseEntity; } public function parseResponse() {} public function __destruct() { //fclose($this->connect); } }set_time_limit(60);$http=new Httpwrap("http://www.mmkao.com/Beautyleg/");//设置HTTP请求行$http->setRequestLine("get");//设置HTTP头$http->setRequestHeader(array("Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));$http->setRequestHeader(array("Accept-Language" => "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"));$http->setRequestHeader(array("Accept-Encoding" => "gzip, deflate"));$http->setRequestHeader(array("User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36"));//$http->setRequestHeader(array("Cookie" => "BAIDU_DUP_lcr=http://www.baidu.com/s?wd=beautyleg&rsv_spt=1&issp=1&f=3&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=6&rsv_sug4=415&rsv_sug1=3&oq=beauty&rsv_sug2=0&rsp=0&inputT=2363; safedog-flow-item=8471BA510DA33350ED344AC374D3044A; bdshare_firstime=1415165097782; cscpvrich_fidx=6; AJSTAT_ok_pages=2; AJSTAT_ok_times=2; CNZZDATA3811623=cnzz_eid%3D253823549-1415164312-http%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1415169712"));//发送数据$http->sendRequest();//$http->receiveResponseHeader();?>通过这个类给领一个自定义的脚本,可以发送和接收数据,领一个脚本如下:<?php if(!empty($_POST)){ $str=implode(',',$_POST); file_put_contents('./post.txt', $str,FILE_APPEND); echo $str;}?>但是给这个网站发送请求时,却超时:网站是:http://www.mmkao.com/Beautyleg/通过chrome给这个网站首页发送请求时的header头信息:Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8Accept-Encoding:gzip,deflate,sdchAccept-Language:zh,en;q=0.8,zh-TW;q=0.6,zh-CN;q=0.4,ja;q=0.2Cache-Control:max-age=0Connection:keep-aliveCookie:BAIDU_DUP_lcr=http://www.baidu.com/s?wd=beautyleg&rsv_spt=1&issp=1&f=3&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=6&rsv_sug4=415&rsv_sug1=3&oq=beauty&rsv_sug2=0&rsp=0&inputT=2363; safedog-flow-item=8471BA510DA33350ED344AC374D3044A; bdshare_firstime=1415165097782; cscpvrich_fidx=7; AJSTAT_ok_pages=3; AJSTAT_ok_times=2; CNZZDATA3811623=cnzz_eid%3D253823549-1415164312-http%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1415169712DNT:1Host:www.mmkao.comUser-Agent:Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36Response Headersview source//通过相同的包装,并调用Httpwrap发送请求时,却提示超时,是在不知道哪里出问题........针对这个网站写了一个过滤出图片链接的类:<?php class Parseimage{ private $responseBody=null; private $imgLink=null; private $pageNum=null; private header=null; private body=null; public function __construct($body) { $this->responseBody=$body; } //匹配图片src开始的链接地址 public function feedImage() { $pat='#<img (.*?)src="(.*?)(?=")#i'; if(preg_match_all($pat, $body, $match)) { foreach($match[2] as $link) { $this- alt="自定义HTTP抓包和过滤" >imgLink[]=$link; } } else { echo '匹配失败图片链接地址失败'."<br />"; } } //提取head部分 public function filterHeader($body) { $pat='#<head>[\s\S]+</head>#im'; if(preg_match($pat, $body, $match)) { $this->header=$match[0]; } else { echo '匹配head部分失败'."<br />"; } } //提取body部分 public function filterBody($body) { $pat='#<body>[\s\S]+</body>#im'; if(preg_match($pat, $body, $match)) { $this->body=$match[0]; } else { echo '匹配body部分失败'."<br />"; } } //提取分页信息,这个只能针对性的匹配,不能通用 public function rollPage($body) { $pat='#[\x{4e00}-\x{9fa5}]+\s*\d\s+?/\s+?\d+\s*[\x{4e00}-\x{9fa5}]*#ui'; if(preg_match($pat, $body, $match)) { $patNum='#/\s*(\d\d*)#'; if(preg_match($patNum, $match[0], $num)) { $this->pageNum=$num[1]; } else { echo '提取分页具体值失败'."<br />"; } } else { echo '提取分页统计失败'."<br />"; } }?>附注: 这两个类,,都通过了内网的测试,并成功过滤出图片的链接,但是给http://www.mmkao.com/Beautyleg/发送请求时,却提示超时,,不知道哪里出了问题。。。。。。
在window的命令行下,提交请求,是可以收到数据的......
可以收到数据
$url = 'http://www.mmkao.com/Beautyleg/';print_r(get_headers($url));
Array( [0] => HTTP/1.1 200 OK [1] => Connection: close [2] => Date: Wed, 05 Nov 2014 08:53:09 GMT [3] => Content-Length: 13889 [4] => Content-Type: text/html [5] => Content-Location: http://www.mmkao.com/Beautyleg/index.html [6] => Last-Modified: Wed, 05 Nov 2014 05:39:09 GMT [7] => Accept-Ranges: bytes [8] => ETag: "e8939ad2baf8cf1:693" [9] => Server: IIS [10] => X-Powered-By: WAF/2.0 [11] => Set-Cookie: safedog-flow-item=8471BA510DA33350ED344AC374D3044A; expires=Sat, 12-Dec-2150 10:26:25 GMT; domain=mmkao.com; path=/)
$url = 'http://www.mmkao.com/Beautyleg/';print_r(get_headers($url));
Array( [0] => HTTP/1.1 200 OK [1] => Connection: close [2] => Date: Wed, 05 Nov 2014 08:53:09 GMT [3] => Content-Length: 13889 [4] => Content-Type: text/html [5] => Content-Location: http://www.mmkao.com/Beautyleg/index.html [6] => Last-Modified: Wed, 05 Nov 2014 05:39:09 GMT [7] => Accept-Ranges: bytes [8] => ETag: "e8939ad2baf8cf1:693" [9] => Server: IIS [10] => X-Powered-By: WAF/2.0 [11] => Set-Cookie: safedog-flow-item=8471BA510DA33350ED344AC374D3044A; expires=Sat, 12-Dec-2150 10:26:25 GMT; domain=mmkao.com; path=/)
GET /Beautyleg/ HTTP/1.1
Host:www.mmkao.com
Connection:keep-alive
Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language:zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding:gzip, deflate
HTTP/1.1 200 OKDate: Wed, 05 Nov 2014 09:34:02 GMTContent-Length: 13889Content-Type: text/htmlContent-Location: http://www.mmkao.com/Beautyleg/index.htmlLast-Modified: Wed, 05 Nov 2014 05:39:09 GMTAccept-Ranges: bytesETag: "e8939ad2baf8cf1:693"Server: IISX-Powered-By: WAF/2.0Set-Cookie: safedog-flow-item=BFC86EA38C3E0337FB45DCE403130335; expires=Sat, 12-Dec-2150 11:07:18 GMT; domain=mmkao.com; path=/<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=gb2312" /><title>Beautyleg_咪咪图秀</title><meta name="keywords" content="Beautyleg"><meta name="description" content="Beautyleg"><style type="text/css">a { color: #993399; text-decoration:none; }body { margin:10px; color: #993399; text-align:center; font-size:12px; font-family:宋体; }div { margin:auto; }ul { margin:0px; padding:0px; }li {list-style:none;}img{ border:0px; vertical-align:middle; }.dh li{ float:left; width:108px; height:39px; line-height:39px; background-image:url(../Image2/list_bg.gif); background-repeat:no-repeat; background-position:106px 4px; }.dh a:hover { text-decoration:underline;}.dh a{ color:#ffffff; font-weight:bold; }.dh li.vis a{color:#FFFFFF; font-weight:bolder;}.lm { width:980px; }.lm .left { float:left; line-height:38px; font-size:14px; color:#CC0066; font-weight:bold; padding-left:40px;}.lm .right { float:right; line-height:38px; font-size:14px; color:#CC0066; font-weight:bold; padding-right:10px; }.lm .new li{ float:left; text-align:left; padding:7px; }.lm .new li span.l{ float:left;}.lm .new li span.r{ float:right; padding-left:5px; }.lm .list li{ width:215px; float:left; text-align:left; padding:7px; border-bottom:#ddd 1px dashed;}.lm .list li span.l1,.l2 { display:block; float:left; width:9px; height:9px; text-align:center; color:#fff;}.lm .list li span.l1{ background: url(Image2/icon1.gif);}.lm .list li span.l2{ background: url(Image/num.gif);}.lm .list li span.r{ float:left; padding-left:5px; }.lm .photo { padding:8px; margin:0px; width:964px; height:190px; }.lm .photo li{ float:left; width:160px; line-height:30px; border-bottom:#ddd 1px dashed; overflow:hidden; text-overflow:ellipsis;white-space:nowrap; }.lm .photo img{ width:120px; height:160px; border:#ddd 1px solid;}.lm .link { width:970px; margin-left:10px;}.lm .link li { float:left; width:240px; line-height:30px; text-align:left; border-bottom:#ddd 1px dashed;overflow:hidden; text-overflow:ellipsis;white-space:nowrap;}</style></head><body>............................ <ul class="page" style="max-width:90%"> 共 1035 组,每页 30 组,当前 1 / 35 页。<a href="./">首页</a> <a href="./">上一页</a> <a href="2.html">下一页</a> <a href="35.html">尾页</a>。 转到第 <select onchange="location.href = this.value"> <script language="javascript" type="text/javascript"> var Str = ""; for (i = 1; i <= 35 </ul> </div> <div style="height:10px; font-size:1px; background-image:url(../Image2/2.gif); background-repeat:no-repeat; background-position:0px -38px;"></div></div><div style="width:980px;" > <ul style="border:1px #CCCCCC solid; text-align:center; padding:5px;"> <script src='/g.js' language='JavaScript' charset='gb2312'></script> </ul></div><div style="width:980px;" > <ul style="border:1px #CCCCCC solid; text-align:center; padding:5px;"> <script src='/d.js' language='JavaScript' charset='gb2312'></script> </ul></div><div class="lm" style="background: url(../Image2/bottom.gif); background-repeat:no-repeat; background-position:0px 0px; height:100px; margin-top:8px; line-height:100px;"> <ul> <li><a href="http://www.mmkao.com/">咪咪图秀</a> www.mmkao.com <script src='/tj.js' language='JavaScript' charset='gb2312'></script></li> </ul></div></body></html>
$url = 'http://www.mmkao.com/Beautyleg/';print_r(get_headers($url));
Array( [0] => HTTP/1.1 200 OK [1] => Connection: close [2] => Date: Wed, 05 Nov 2014 08:53:09 GMT [3] => Content-Length: 13889 [4] => Content-Type: text/html [5] => Content-Location: http://www.mmkao.com/Beautyleg/index.html [6] => Last-Modified: Wed, 05 Nov 2014 05:39:09 GMT [7] => Accept-Ranges: bytes [8] => ETag: "e8939ad2baf8cf1:693" [9] => Server: IIS [10] => X-Powered-By: WAF/2.0 [11] => Set-Cookie: safedog-flow-item=8471BA510DA33350ED344AC374D3044A; expires=Sat, 12-Dec-2150 10:26:25 GMT; domain=mmkao.com; path=/)