목차
自定义HTTP抓包和过滤
Jun 23, 2016 pm 01:44 PM 定义一个http抓包类,发送数据到一个自定义的接受脚本,可以发送成功,并收取数据,但是发送到外网,却不行,分析过在浏览器下发送HTTP请求时的request header 信息,通过模拟请求,但超时...
//定义一个HTTP抓包类,其实也可以用curl。。。。。<?php ini_set('error_reporting', E_ALL);class Httpwrap{ private $hostInfo=null; private $requestLine=null; private $requestHeader=null; private $emptyLine="\r\n"; private $requestBody=null; private $requestEntity=null; private $responseEntity=null; private $responseHeader=null; private $responseBody=null; private $emptyLinePos=null; private $connect=null; private $errNo=null; private $errStr=null; public function __construct($url) { $this->hostInfo=parse_url($url); $this->setRequestHeader(array('Host' => $this->hostInfo['host'])); $this->setRequestHeader(array('Connection' => 'keep-alive')); } //设置HTTP请求行信息,例如: GET /resources HTTP/1.1 //但为了避免漏掉url中?开始的查询信息,有必要进行判断 public function setRequestLine($method) { //如果是POST请求,则自动添加content-type头信息 if(strtolower($method)=='post') { $this->setRequestHeader(array('Content-type' => 'application/x-www-form-urlencoded')); } if(!empty($this->hostInfo['query'])) { $this->requestLine=strtoupper($method)." ".$this->hostInfo['path']."?".$this->hostInfo['query']." HTTP/1.1 \r\n"; } else { $this->requestLine=strtoupper($method)." ".$this->hostInfo['path']." HTTP/1.1 \r\n"; } } //设置HTTP请求头。 //接收参数是数组类型,通过迭代拼接key:value,并换行 public function setRequestHeader($header) { foreach($header as $key => $value) { $this->requestHeader .=$key.":".$value."\r\n"; } } //设置HTTP请求体 //接收参数是数组类型,通过迭代拼接key=value,因为最后一席拼接会有一个多余的&,所以有必要去掉 public function setRequestBody($body) { foreach($body as $key => $value) { $this->requestBody .=$key.'='.$value.'&'; } $offset=strrpos($this->requestBody, '&'); $this->requestBody=substr($this->requestBody, 0, $offset); } //组装 请求行+请求头+请求体,并根据请求体的长度,自动填充请求头的content-length字段 public function setRequestEntity() { if(!empty($this->requestBody)) { $contentLength=strlen($this->requestBody); $this->setRequestHeader(array('Content-length' => $contentLength)); $this->requestEntity=$this->requestLine.$this->requestHeader."\r\n".$this->requestBody; } else { $this->requestEntity=$this->requestLine.$this->requestHeader."\r\n"; } } //解析主机名的函数,暂时没有用上....... public function parseHost($url) { $pat='#http://([^/]+)#i'; if(preg_match($pat, $url, $match)) { return $match[1]; } else { echo '匹配主机信息失败<br />'; } } //创建到主机的连接 public function createConnect() { $this->connect=fsockopen($this->hostInfo['host'], 80, $this->errNo, $this->errStr) or die('连接主机失败'.$this->errStr); } //发送请求 public function sendRequest() { $this->setRequestEntity(); echo $this->requestEntity; exit(); $this->createConnect(); $entityLength=strlen($this->requestEntity); if($entityLength != fwrite($this->connect, $this->requestEntity, $entityLength)) { die('写入数据失败<br />'); } else { $this->receiveResponse(); } } //接受请求,并依次拼接响应体 public function receiveResponse() { while(!feof($this->connect)) { $this->responseEntity .= fread($this->connect, 1024); } } //计算响应头与响应体之间的空行的位置 public function calculateEmptyLinePos() { $this->emptyLinePos=strpos($this->responseEntity,"\r\n\r\n",0); } //接受响应体的头部.... public function receiveResponseHeader() { $this->calculateEmptyLinePos(); $this->responseHeader=substr($this->responseEntity, 0, $this->emptyLinePos); echo $this->responseHeader; } //接收响应体的body部分 public function receiveResponseBody() { $this->calculateEmptyLinePos(); $this->responseBody=substr($this->responseEntity, $this->emptyLinePos); } //返回请求结果 public function getResponse() { return $this->responseEntity; } public function parseResponse() {} public function __destruct() { //fclose($this->connect); } }set_time_limit(60);$http=new Httpwrap("http://www.mmkao.com/Beautyleg/");//设置HTTP请求行$http->setRequestLine("get");//设置HTTP头$http->setRequestHeader(array("Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));$http->setRequestHeader(array("Accept-Language" => "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"));$http->setRequestHeader(array("Accept-Encoding" => "gzip, deflate"));$http->setRequestHeader(array("User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36"));//$http->setRequestHeader(array("Cookie" => "BAIDU_DUP_lcr=http://www.baidu.com/s?wd=beautyleg&rsv_spt=1&issp=1&f=3&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=6&rsv_sug4=415&rsv_sug1=3&oq=beauty&rsv_sug2=0&rsp=0&inputT=2363; safedog-flow-item=8471BA510DA33350ED344AC374D3044A; bdshare_firstime=1415165097782; cscpvrich_fidx=6; AJSTAT_ok_pages=2; AJSTAT_ok_times=2; CNZZDATA3811623=cnzz_eid%3D253823549-1415164312-http%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1415169712"));//发送数据$http->sendRequest();//$http->receiveResponseHeader();?>通过这个类给领一个自定义的脚本,可以发送和接收数据,领一个脚本如下:<?php if(!empty($_POST)){ $str=implode(',',$_POST); file_put_contents('./post.txt', $str,FILE_APPEND); echo $str;}?>但是给这个网站发送请求时,却超时:网站是:http://www.mmkao.com/Beautyleg/通过chrome给这个网站首页发送请求时的header头信息:Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8Accept-Encoding:gzip,deflate,sdchAccept-Language:zh,en;q=0.8,zh-TW;q=0.6,zh-CN;q=0.4,ja;q=0.2Cache-Control:max-age=0Connection:keep-aliveCookie:BAIDU_DUP_lcr=http://www.baidu.com/s?wd=beautyleg&rsv_spt=1&issp=1&f=3&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=6&rsv_sug4=415&rsv_sug1=3&oq=beauty&rsv_sug2=0&rsp=0&inputT=2363; safedog-flow-item=8471BA510DA33350ED344AC374D3044A; bdshare_firstime=1415165097782; cscpvrich_fidx=7; AJSTAT_ok_pages=3; AJSTAT_ok_times=2; CNZZDATA3811623=cnzz_eid%3D253823549-1415164312-http%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1415169712DNT:1Host:www.mmkao.comUser-Agent:Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36Response Headersview source//通过相同的包装,并调用Httpwrap发送请求时,却提示超时,是在不知道哪里出问题........针对这个网站写了一个过滤出图片链接的类:<?php class Parseimage{ private $responseBody=null; private $imgLink=null; private $pageNum=null; private header=null; private body=null; public function __construct($body) { $this->responseBody=$body; } //匹配图片src开始的链接地址 public function feedImage() { $pat='#<img src="/static/imghw/default1.png" data-src="(.*?)(?=" class="lazy" (.*?))#i'; if(preg_match_all($pat, $body, $match)) { foreach($match[2] as $link) { $this- alt="自定义HTTP抓包和过滤" >imgLink[]=$link; } } else { echo '匹配失败图片链接地址失败'."<br />"; } } //提取head部分 public function filterHeader($body) { $pat='#<head>[\s\S]+</head>#im'; if(preg_match($pat, $body, $match)) { $this->header=$match[0]; } else { echo '匹配head部分失败'."<br />"; } } //提取body部分 public function filterBody($body) { $pat='#<body>[\s\S]+</body>#im'; if(preg_match($pat, $body, $match)) { $this->body=$match[0]; } else { echo '匹配body部分失败'."<br />"; } } //提取分页信息,这个只能针对性的匹配,不能通用 public function rollPage($body) { $pat='#[\x{4e00}-\x{9fa5}]+\s*\d\s+?/\s+?\d+\s*[\x{4e00}-\x{9fa5}]*#ui'; if(preg_match($pat, $body, $match)) { $patNum='#/\s*(\d\d*)#'; if(preg_match($patNum, $match[0], $num)) { $this->pageNum=$num[1]; } else { echo '提取分页具体值失败'."<br />"; } } else { echo '提取分页统计失败'."<br />"; } }?>附注: 这两个类,,都通过了内网的测试,并成功过滤出图片的链接,但是给http://www.mmkao.com/Beautyleg/发送请求时,却提示超时,,不知道哪里出了问题。。。。。。
로그인 후 복사
回复讨论(解决方案)
在window的命令行下,提交请求,是可以收到数据的......
可以收到数据
$url = 'http://www.mmkao.com/Beautyleg/';print_r(get_headers($url));
로그인 후 복사
로그인 후 복사
로그인 후 복사
Array( [0] => HTTP/1.1 200 OK [1] => Connection: close [2] => Date: Wed, 05 Nov 2014 08:53:09 GMT [3] => Content-Length: 13889 [4] => Content-Type: text/html [5] => Content-Location: http://www.mmkao.com/Beautyleg/index.html [6] => Last-Modified: Wed, 05 Nov 2014 05:39:09 GMT [7] => Accept-Ranges: bytes [8] => ETag: "e8939ad2baf8cf1:693" [9] => Server: IIS [10] => X-Powered-By: WAF/2.0 [11] => Set-Cookie: safedog-flow-item=8471BA510DA33350ED344AC374D3044A; expires=Sat, 12-Dec-2150 10:26:25 GMT; domain=mmkao.com; path=/)
로그인 후 복사
로그인 후 복사
로그인 후 복사
$url = 'http://www.mmkao.com/Beautyleg/';print_r(get_headers($url));
로그인 후 복사
로그인 후 복사
로그인 후 복사
Array( [0] => HTTP/1.1 200 OK [1] => Connection: close [2] => Date: Wed, 05 Nov 2014 08:53:09 GMT [3] => Content-Length: 13889 [4] => Content-Type: text/html [5] => Content-Location: http://www.mmkao.com/Beautyleg/index.html [6] => Last-Modified: Wed, 05 Nov 2014 05:39:09 GMT [7] => Accept-Ranges: bytes [8] => ETag: "e8939ad2baf8cf1:693" [9] => Server: IIS [10] => X-Powered-By: WAF/2.0 [11] => Set-Cookie: safedog-flow-item=8471BA510DA33350ED344AC374D3044A; expires=Sat, 12-Dec-2150 10:26:25 GMT; domain=mmkao.com; path=/)
로그인 후 복사
로그인 후 복사
로그인 후 복사
这样确实可以。蛋疼,我再看看
GET /Beautyleg/ HTTP/1.1
Host:www.mmkao.com
Connection:keep-alive
Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language:zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
Accept-Encoding:gzip, deflate
HTTP/1.1 200 OKDate: Wed, 05 Nov 2014 09:34:02 GMTContent-Length: 13889Content-Type: text/htmlContent-Location: http://www.mmkao.com/Beautyleg/index.htmlLast-Modified: Wed, 05 Nov 2014 05:39:09 GMTAccept-Ranges: bytesETag: "e8939ad2baf8cf1:693"Server: IISX-Powered-By: WAF/2.0Set-Cookie: safedog-flow-item=BFC86EA38C3E0337FB45DCE403130335; expires=Sat, 12-Dec-2150 11:07:18 GMT; domain=mmkao.com; path=/<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=gb2312" /><title>Beautyleg_咪咪图秀</title><meta name="keywords" content="Beautyleg"><meta name="description" content="Beautyleg"><style type="text/css">a { color: #993399; text-decoration:none; }body { margin:10px; color: #993399; text-align:center; font-size:12px; font-family:宋体; }div { margin:auto; }ul { margin:0px; padding:0px; }li {list-style:none;}img{ border:0px; vertical-align:middle; }.dh li{ float:left; width:108px; height:39px; line-height:39px; background-image:url(../Image2/list_bg.gif); background-repeat:no-repeat; background-position:106px 4px; }.dh a:hover { text-decoration:underline;}.dh a{ color:#ffffff; font-weight:bold; }.dh li.vis a{color:#FFFFFF; font-weight:bolder;}.lm { width:980px; }.lm .left { float:left; line-height:38px; font-size:14px; color:#CC0066; font-weight:bold; padding-left:40px;}.lm .right { float:right; line-height:38px; font-size:14px; color:#CC0066; font-weight:bold; padding-right:10px; }.lm .new li{ float:left; text-align:left; padding:7px; }.lm .new li span.l{ float:left;}.lm .new li span.r{ float:right; padding-left:5px; }.lm .list li{ width:215px; float:left; text-align:left; padding:7px; border-bottom:#ddd 1px dashed;}.lm .list li span.l1,.l2 { display:block; float:left; width:9px; height:9px; text-align:center; color:#fff;}.lm .list li span.l1{ background: url(Image2/icon1.gif);}.lm .list li span.l2{ background: url(Image/num.gif);}.lm .list li span.r{ float:left; padding-left:5px; }.lm .photo { padding:8px; margin:0px; width:964px; height:190px; }.lm .photo li{ float:left; width:160px; line-height:30px; border-bottom:#ddd 1px dashed; overflow:hidden; text-overflow:ellipsis;white-space:nowrap; }.lm .photo img{ width:120px; height:160px; border:#ddd 1px solid;}.lm .link { width:970px; margin-left:10px;}.lm .link li { float:left; width:240px; line-height:30px; text-align:left; border-bottom:#ddd 1px dashed;overflow:hidden; text-overflow:ellipsis;white-space:nowrap;}</style></head><body>............................ <ul class="page" style="max-width:90%"> 共 1035 组,每页 30 组,当前 1 / 35 页。<a href="./">首页</a> <a href="./">上一页</a> <a href="2.html">下一页</a> <a href="35.html">尾页</a>。 转到第 <select onchange="location.href = this.value"> <script language="javascript" type="text/javascript"> var Str = ""; for (i = 1; i <= 35 </ul> </div> <div style="height:10px; font-size:1px; background-image:url(../Image2/2.gif); background-repeat:no-repeat; background-position:0px -38px;"></div></div><div style="width:980px;" > <ul style="border:1px #CCCCCC solid; text-align:center; padding:5px;"> <script src='/g.js' language='JavaScript' charset='gb2312'></script> </ul></div><div style="width:980px;" > <ul style="border:1px #CCCCCC solid; text-align:center; padding:5px;"> <script src='/d.js' language='JavaScript' charset='gb2312'></script> </ul></div><div class="lm" style="background: url(../Image2/bottom.gif); background-repeat:no-repeat; background-position:0px 0px; height:100px; margin-top:8px; line-height:100px;"> <ul> <li><a href="http://www.mmkao.com/">咪咪图秀</a> www.mmkao.com <script src='/tj.js' language='JavaScript' charset='gb2312'></script></li> </ul></div></body></html>
로그인 후 복사
这个截图是我通过Httpwrap发送的请求头的信息,复制到telnent上也能完整获取网页
$url = 'http://www.mmkao.com/Beautyleg/';print_r(get_headers($url));
로그인 후 복사
로그인 후 복사
로그인 후 복사
Array( [0] => HTTP/1.1 200 OK [1] => Connection: close [2] => Date: Wed, 05 Nov 2014 08:53:09 GMT [3] => Content-Length: 13889 [4] => Content-Type: text/html [5] => Content-Location: http://www.mmkao.com/Beautyleg/index.html [6] => Last-Modified: Wed, 05 Nov 2014 05:39:09 GMT [7] => Accept-Ranges: bytes [8] => ETag: "e8939ad2baf8cf1:693" [9] => Server: IIS [10] => X-Powered-By: WAF/2.0 [11] => Set-Cookie: safedog-flow-item=8471BA510DA33350ED344AC374D3044A; expires=Sat, 12-Dec-2150 10:26:25 GMT; domain=mmkao.com; path=/)
로그인 후 복사
로그인 후 복사
로그인 후 복사
这个问题遗留了好久,今天花时间解决了,而且可以整站采集,,,,,
http://blog.csdn.net/free_program_1314/article/details/41780835
본 웹사이트의 성명
본 글의 내용은 네티즌들의 자발적인 기여로 작성되었으며, 저작권은 원저작자에게 있습니다. 본 사이트는 이에 상응하는 법적 책임을 지지 않습니다. 표절이나 침해가 의심되는 콘텐츠를 발견한 경우 admin@php.cn으로 문의하세요.

인기 기사
Repo : 팀원을 부활시키는 방법
3 몇 주 전
By 尊渡假赌尊渡假赌尊渡假赌
R.E.P.O. 에너지 결정과 그들이하는 일 (노란색 크리스탈)
1 몇 주 전
By 尊渡假赌尊渡假赌尊渡假赌
스플릿 소설을이기는 데 얼마나 걸립니까?
3 몇 주 전
By DDD
헬로 키티 아일랜드 어드벤처 : 거대한 씨앗을 얻는 방법
3 몇 주 전
By 尊渡假赌尊渡假赌尊渡假赌

인기 기사
Repo : 팀원을 부활시키는 방법
3 몇 주 전
By 尊渡假赌尊渡假赌尊渡假赌
R.E.P.O. 에너지 결정과 그들이하는 일 (노란색 크리스탈)
1 몇 주 전
By 尊渡假赌尊渡假赌尊渡假赌
스플릿 소설을이기는 데 얼마나 걸립니까?
3 몇 주 전
By DDD
헬로 키티 아일랜드 어드벤처 : 거대한 씨앗을 얻는 방법
3 몇 주 전
By 尊渡假赌尊渡假赌尊渡假赌

뜨거운 기사 태그

메모장++7.3.1
사용하기 쉬운 무료 코드 편집기

SublimeText3 중국어 버전
중국어 버전, 사용하기 매우 쉽습니다.

스튜디오 13.0.1 보내기
강력한 PHP 통합 개발 환경

드림위버 CS6
시각적 웹 개발 도구

SublimeText3 Mac 버전
신 수준의 코드 편집 소프트웨어(SublimeText3)

뜨거운 주제
Gmail 이메일의 로그인 입구는 어디에 있나요?
7296
9


자바 튜토리얼
1622
14


Cakephp 튜토리얼
1342
46


라라벨 튜토리얼
1259
25


PHP 튜토리얼
1206
29



Laravel Back End : Part 2, React가있는 React 앱 구축

PHP의 컬 : REST API에서 PHP Curl Extension 사용 방법
