This is a flexible multi-threaded (concurrent) wrapper around cURL. It differs from the sample provided in the PHP manual (http://us2.php.net/manual/zh/function.curl-multi-select.php), and it executes much more efficiently. There are two files here: the first is the muti_curl file, which contains two classes; the second shows how to use it, in this case to batch-check whether proxy IPs are usable.
/**
 * Plain description of one HTTP request to be queued on muti_curl.
 *
 * Properties are public and read directly by the executor:
 *  - url:       target URL
 *  - method:    HTTP method name, "GET" by default
 *  - post_data: request body (string or array accepted by CURLOPT_POSTFIELDS), or null
 *  - headers:   list of raw "Name: value" header lines, or null
 *  - options:   array of CURLOPT_* => value overrides for this request, or null
 */
class request_setting {
    public $url = false;
    public $method = 'GET';
    public $post_data = null;
    public $headers = null;
    public $options = null;

    /**
     * @param string            $url       Target URL.
     * @param string            $method    HTTP method name.
     * @param string|array|null $post_data Request body, if any.
     * @param array|null        $headers   Raw header lines for this request.
     * @param array|null        $options   Per-request CURLOPT_* overrides.
     */
    public function __construct($url, $method = "GET", $post_data = null, $headers = null, $options = null) {
        $this->url = $url;
        $this->method = $method;
        $this->post_data = $post_data;
        $this->headers = $headers;
        $this->options = $options;
    }

    /**
     * Drops the stored values. PHP releases properties on its own when the
     * object is destroyed; the destructor is kept only so that existing code
     * calling it explicitly keeps working.
     */
    public function __destruct() {
        unset($this->url, $this->method, $this->post_data, $this->headers, $this->options);
    }
}
/**
 * Batch executor that runs queued requests through cURL's multi interface
 * with a rolling window: at most $thread_size transfers are in flight, and
 * each finished transfer is immediately replaced by the next queued request.
 *
 * The optional callback is invoked once per finished transfer as
 * callback($output, $curl_info, $request).
 */
class muti_curl {
    /** Maximum number of transfers kept in flight at the same time. */
    protected $thread_size = 100;
    /** Seconds curl_multi_select() may block while waiting for activity. */
    protected $timeout = 30;
    /** Callable invoked for every finished transfer, or null. */
    private $callback;
    /**
     * Default cURL options applied to every request; per-request options
     * take precedence. See the PHP manual for curl_setopt() for the full
     * list of available CURLOPT_* settings.
     */
    protected $options = array(
        // NOTE(review): peer certificate verification is disabled here, so
        // HTTPS responses are not authenticated. Left as in the original
        // because callers may depend on it; enable it where possible.
        CURLOPT_SSL_VERIFYPEER => false,
        // Return the response body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_CONNECTTIMEOUT => 15,
        CURLOPT_TIMEOUT => 30,
    );
    /** Raw header lines sent with every request. */
    private $headers = array();
    /** Queue of request_setting-like objects. */
    private $requests = array();
    /** Maps an in-flight handle key to the index of its request in $requests. */
    private $requestMap = array();

    /**
     * @param callable|null $callback Receives ($output, $curl_info, $request) per finished transfer.
     */
    public function __construct($callback = null) {
        $this->callback = $callback;
    }

    /**
     * Read access to the non-public properties; unknown names yield null.
     */
    public function __get($name) {
        return (isset($this->{$name})) ? $this->{$name} : null;
    }

    /**
     * Write access to the non-public properties.
     *
     * "options" and "headers" are additive: new option keys override the
     * existing ones, and new header lines are placed in front of the existing
     * lines. Headers are merged as a list, because a `+` union of two lists
     * would silently drop entries that share a numeric index.
     */
    public function __set($name, $value) {
        if ($name == "options") {
            $this->options = $value + $this->options;
        } elseif ($name == "headers") {
            $this->headers = array_merge(array_values((array) $value), array_values($this->headers));
        } else {
            $this->{$name} = $value;
        }
        return true;
    }

    /** Queues an already-built request object. */
    public function add($request) {
        $this->requests[] = $request;
        return true;
    }

    /** Builds a request_setting from the arguments and queues it. */
    public function request($url, $method = "GET", $post_data = null, $headers = null, $options = null) {
        $this->requests[] = new request_setting($url, $method, $post_data, $headers, $options);
        return true;
    }

    /** Queues a GET request. */
    public function get($url, $headers = null, $options = null) {
        return $this->request($url, "GET", null, $headers, $options);
    }

    /** Queues a POST request. */
    public function post($url, $post_data = null, $headers = null, $options = null) {
        return $this->request($url, "POST", $post_data, $headers, $options);
    }

    /**
     * Runs the queue. A single queued request is executed with a plain
     * curl_exec(); anything else goes through the rolling multi window.
     *
     * @param int|null $thread_size Optional window size for this and later runs.
     * @return mixed true, or the raw curl_exec() result when exactly one
     *               request is queued and no callback is configured.
     * @throws Exception when the effective window size is below 2.
     */
    public function execute($thread_size = null) {
        if (count($this->requests) == 1) {
            return $this->single_curl();
        }
        return $this->rolling_curl($thread_size);
    }

    /**
     * Executes the first queued request synchronously and removes it from
     * the queue.
     */
    private function single_curl() {
        $request = array_shift($this->requests);
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($request));
        $output = curl_exec($ch);
        $curl_info = curl_getinfo($ch);
        $this->close_handle($ch);

        if (!$this->callback) {
            return $output;
        }
        if (is_callable($this->callback)) {
            call_user_func($this->callback, $output, $curl_info, $request);
        }
        return true;
    }

    /**
     * Executes all queued requests with a rolling window of parallel transfers.
     */
    private function rolling_curl($thread_size = null) {
        if ($thread_size) {
            $this->thread_size = $thread_size;
        }

        // The code below addresses requests by position, so normalise keys.
        $this->requests = array_values($this->requests);
        $total = count($this->requests);
        if ($total === 0) {
            // Nothing queued: nothing to do (previously this raised the
            // unrelated "thread size" exception below).
            return true;
        }

        // Clamp locally so the configured window size survives this run.
        $window = min((int) $this->thread_size, $total);
        if ($window < 2) {
            $errorinfo = '线程大小必须大于 1!!!!';
            throw new Exception($errorinfo);
        }

        $queue = curl_multi_init();
        $callback = $this->callback;
        $handles = array(); // in-flight handles, kept for cleanup
        $this->requestMap = array();

        // Fill the initial window.
        for ($next = 0; $next < $window; $next++) {
            $ch = $this->start_request($queue, $next);
            $handles[$this->handle_key($ch)] = $ch;
        }

        $active = 0;
        do {
            do {
                $status = curl_multi_exec($queue, $active);
            } while ($status == CURLM_CALL_MULTI_PERFORM);
            if ($status != CURLM_OK) {
                break;
            }

            // Handle every finished transfer and top the window up again.
            $refilled = false;
            while ($done = curl_multi_info_read($queue)) {
                $handle = $done['handle'];
                $curl_info = curl_getinfo($handle);
                $output = curl_multi_getcontent($handle);
                $key = $this->handle_key($handle);

                $request = null;
                if (isset($this->requestMap[$key])) {
                    $request = $this->requests[$this->requestMap[$key]];
                    unset($this->requestMap[$key]);
                }
                if (is_callable($callback)) {
                    call_user_func($callback, $output, $curl_info, $request);
                }

                if ($next < $total) {
                    $ch = $this->start_request($queue, $next);
                    $handles[$this->handle_key($ch)] = $ch;
                    $next++;
                    $refilled = true;
                }

                curl_multi_remove_handle($queue, $handle);
                $this->close_handle($handle);
                unset($handles[$key]);
            }

            // Wait for socket activity. A return of -1 means select could not
            // wait, so sleep briefly instead of spinning.
            if ($active > 0 && curl_multi_select($queue, $this->timeout) === -1) {
                usleep(1000);
            }

            // $active was reported before the refill above, so it can be 0
            // while freshly added handles still have to run.
        } while ($active > 0 || $refilled);

        // Release whatever is still attached (only after an error break).
        foreach ($handles as $ch) {
            curl_multi_remove_handle($queue, $ch);
            $this->close_handle($ch);
        }
        $this->requestMap = array();
        curl_multi_close($queue);
        return true;
    }

    /**
     * Creates a handle for the request at $index, attaches it to the multi
     * handle and records it in the request map.
     */
    private function start_request($queue, $index) {
        $ch = curl_init();
        curl_setopt_array($ch, $this->get_options($this->requests[$index]));
        curl_multi_add_handle($queue, $ch);
        $this->requestMap[$this->handle_key($ch)] = $index;
        return $ch;
    }

    /**
     * Returns an array key identifying a cURL handle. Handles are objects on
     * PHP 8 (a string cast fails there) and resources on older versions.
     */
    private function handle_key($ch) {
        return is_object($ch) ? spl_object_id($ch) : (int) $ch;
    }

    /**
     * Frees a handle on PHP versions where it is a resource; object handles
     * are released automatically once unreferenced.
     */
    private function close_handle($ch) {
        if (is_resource($ch)) {
            curl_close($ch);
        }
    }

    /**
     * Builds the final CURLOPT_* array for one request: class defaults,
     * overridden by the request's own options, plus URL, body and headers.
     */
    private function get_options($request) {
        $options = $this->options;
        if ($request->options) {
            $options = $request->options + $options;
        }
        $options[CURLOPT_URL] = $request->url;

        // Send a POST when a body is present or the request asks for POST.
        $wants_post = isset($request->method) && strtoupper((string) $request->method) === 'POST';
        if ($request->post_data || $wants_post) {
            $options[CURLOPT_POST] = 1;
            $options[CURLOPT_POSTFIELDS] = ($request->post_data !== null) ? $request->post_data : '';
        }

        // Shared headers first, then the request's own header lines.
        $headers = array_values($this->headers);
        if (!empty($request->headers)) {
            $headers = array_merge($headers, array_values((array) $request->headers));
        }
        if ($headers) {
            $options[CURLOPT_HEADER] = 0;
            $options[CURLOPT_HTTPHEADER] = $headers;
        }
        return $options;
    }

    /**
     * Drops the stored state. PHP releases it automatically; kept so that
     * existing code calling the destructor explicitly keeps working.
     */
    public function __destruct() {
        unset($this->thread_size, $this->callback, $this->options, $this->headers, $this->requests);
    }
}
- ?>
Usage example (second file): batch-checking whether proxy IPs are usable
// Script preamble: declare the response encoding, load the cURL helper
// classes and lift the execution time limit for long batches.
header("content-type:text/html; charset=utf-8");
require("muti_curl_class.php");
set_time_limit(0);

// Shared state updated by request_callback() through `global`.
$sucesesnum = 0;          // number of finished transfers
$good_proxy = array();    // proxies whose response contained the marker text
/**
 * Callback run for every finished transfer.
 *
 * Prints a short progress line and, when the response body contains the
 * marker text "User-agent: Baiduspider", treats the proxy used for the
 * request as working: it is printed and appended to the global $good_proxy.
 * The global $sucesesnum counter is incremented for every call.
 *
 * @param string|false|null $response Response body, or false/null on failure.
 * @param array             $info     curl_getinfo() data; 'total_time' is printed.
 * @param object            $request  Request object; its options[CURLOPT_PROXY] is read.
 */
function request_callback($response, $info, $request) {
    global $sucesesnum, $good_proxy;

    echo "\n";

    // A response that carries the marker text means the proxy worked.
    if (is_string($response) && strpos($response, 'User-agent: Baiduspider') !== false) {
        echo "true\n";
        if (isset($request->options[CURLOPT_PROXY])) {
            $proxy = $request->options[CURLOPT_PROXY];
            // The proxy string originates from request input and the page is
            // served as HTML, so escape it before printing.
            echo htmlspecialchars((string) $proxy, ENT_QUOTES, 'UTF-8');
            $good_proxy[] = $proxy;
        }
    }

    $total_time = isset($info['total_time']) ? $info['total_time'] : '';
    echo "\nthe-->" . $sucesesnum . '<---use:' . $total_time;
    $sucesesnum++;
    echo "\n";
}
// ---------------------------------------------------------------------------
// Main flow: read the proxy list from the request, probe every proxy in
// parallel against a fixed URL and print the ones that worked.
// ---------------------------------------------------------------------------

// POST values override GET values of the same name.
$params = array_merge($_GET, $_POST);

$proxy_ip = (isset($params['ip']) && is_string($params['ip'])) ? trim($params['ip']) : '';
$result = $proxy_ip;

// Clamp the user-supplied tuning values to sane ranges.
$timeout = isset($params['timeout']) ? intval(trim($params['timeout'])) : 0;
$timeout = max(3, min(300, $timeout));
$thread_size = isset($params['thread_size']) ? intval(trim($params['thread_size'])) : 0;
$thread_size = max(5, min(300, $thread_size));

if ($proxy_ip == '') {
    echo 'Please enter IP!!';
    return;
}

// Strip decoration that commonly surrounds pasted proxy lists.
$replace_arr1 = array(' ', 'qq proxy:', 'dn28.com', 'qqip', 'qq proxy', 'qq proxy ip', 'Agent ip:', 'ip:', 'Agent ip', '"', "'", '\\', '/');
$result = str_replace($replace_arr1, '', $result);

// Entries are separated by commas or line breaks; everything from '@' on
// (type/location annotation) is dropped. Empty entries are skipped.
$newRes = array();
foreach (preg_split('/[\r\n,]+/', $result, -1, PREG_SPLIT_NO_EMPTY) as $v) {
    $posProxy = strpos($v, '@');
    $proxyip_and_port = trim($posProxy === false ? $v : substr($v, 0, $posProxy));
    if ($proxyip_and_port !== '') {
        $newRes[] = $proxyip_and_port;
    }
}
if (!$newRes) {
    echo 'Please enter IP!!';
    return;
}
// The list comes from request input, so escape it for the HTML response.
echo htmlspecialchars(print_r($newRes, true), ENT_QUOTES, 'UTF-8');

$option_setting = array(
    CURLOPT_SSL_VERIFYPEER => 0,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_CONNECTTIMEOUT => 5,
    CURLOPT_TIMEOUT => 30,
    CURLOPT_HEADER => false,
    CURLOPT_PROXY => '', // filled in per request below
);
$url = 'http://www.baidu.com/robots.txt';

$btime = time();
$rc = new muti_curl("request_callback");
$rc->timeout = $timeout;
$rc->thread_size = $thread_size;
foreach ($newRes as $v) {
    $option_setting[CURLOPT_PROXY] = $v;
    $rc->add(new request_setting($url, "GET", null, null, $option_setting));
}
$rc->execute();
$usedtime = time() - $btime;

echo 'all' . $sucesesnum . 'use' . $usedtime;
echo "\n";

// Print the working proxies as a quoted, comma-terminated list and as a
// PHP array literal.
$good_proxy = array_unique($good_proxy);
$str = '';
foreach ($good_proxy as $v) {
    $str .= "'" . trim($v) . "',";
}
$str = str_replace(' ', '', $str);
$str = preg_replace('/\s+/', ' ', $str);
echo htmlspecialchars($str, ENT_QUOTES, 'UTF-8') . "\n";
echo htmlspecialchars(var_export($good_proxy, true), ENT_QUOTES, 'UTF-8');
-
-
-
-
-
-
-
-
-
-
-
-
-
- //************* *************************************************** *************************************
- //********** *********************Only one function is used
-
/**
 * Splits a proxy description of the form "ip:port@type;location" into its
 * parts, e.g. "202.115.207.25:80@HTTP;Some place".
 *
 * Missing parts yield empty strings: without ';' there is no location,
 * without '@' there is no type, and without ':' there is no port (the
 * remaining text is returned as the ip).
 *
 * @param string $proxyStr Proxy description.
 * @return array Array with the string keys 'ip', 'port', 'type', 'location'.
 */
function parseProxyInfo($proxyStr) {
    $rest = (string) $proxyStr;
    $location = '';
    $type = '';
    $port = '';

    // Peel the fields off from the right: location, then type, then port.
    $semi = strpos($rest, ';');
    if ($semi !== false) {
        $location = (string) substr($rest, $semi + 1);
        $rest = (string) substr($rest, 0, $semi);
    }

    $at = strpos($rest, '@');
    if ($at !== false) {
        $type = (string) substr($rest, $at + 1);
        $rest = (string) substr($rest, 0, $at);
    }

    $colon = strpos($rest, ':');
    if ($colon !== false) {
        $port = (string) substr($rest, $colon + 1);
        $rest = (string) substr($rest, 0, $colon);
    }

    return array(
        'ip' => $rest,
        'port' => $port,
        'type' => $type,
        'location' => $location
    );
}
-
/**
 * Thin wrapper around strpos().
 *
 * @param string $haystack Text to search in.
 * @param string $needle   Text to search for.
 * @return int|false Byte offset of the first occurrence, or false when absent.
 *                   Compare with === because offset 0 is a valid match.
 */
function getPos($haystack, $needle) {
    return strpos($haystack, $needle);
}
-
/**
 * Checks one proxy using the project's `proxy` class.
 *
 * In 'single' mode the ip and port come from the global $params and a
 * human-readable status line is returned. In 'collect' mode they come from
 * $proxy_info_arr and the raw result of check_proxy() is returned. Timeout
 * and retry count are always read from $params, the probe URL from
 * $config['verify_url'].
 *
 * @param string $model           'single' or 'collect'.
 * @param array  $proxy_info_arr  Array with 'ip' and 'port' ('collect' mode only).
 * @return mixed Status line ('single'), the check_proxy() result ('collect'),
 *               or null for any other $model.
 */
function check_proxy_is_useful($model, $proxy_info_arr = array()) {
    global $params, $config;

    $is_single = ($model == 'single');
    if (!$is_single && $model != 'collect') {
        return null;
    }

    // Only the source of ip/port differs between the two modes.
    $source = $is_single ? $params : $proxy_info_arr;
    $proxy_ip = isset($source['ip']) ? trim($source['ip']) : '';
    $proxy_port = isset($source['port']) ? intval(trim($source['port'])) : 0;

    $check_proxy_url = isset($config['verify_url']) ? $config['verify_url'] : null;
    $proxy_time_out = isset($params['timeout']) ? intval(trim($params['timeout'])) : 0;
    $retry = isset($params['retry']) ? intval(trim($params['retry'])) : 0;

    $proxy = new proxy($proxy_ip, $proxy_port, $check_proxy_url, $proxy_time_out, $retry);
    // check_proxy() returns a success string, or boolean false on failure.
    $result = $proxy->check_proxy();

    if (!$is_single) {
        return $result;
    }

    $proxy_str_success = '' . $proxy_ip . ':' . $proxy_port . '@' . 'HTTP 代理验证成功!';
    $proxy_str_failed = '' . $proxy_ip . ':' . $proxy_port . '@' . 'HTTP 代理验证失败!';
    return $result !== false ? $proxy_str_success : $proxy_str_failed;
}
-
/**
 * Checks the single proxy given in the global $params and prints the
 * resulting status line; prints a prompt instead when no ip was supplied.
 */
function get_single() {
    global $params;

    $proxy_ip = isset($params['ip']) ? trim($params['ip']) : '';
    if ($proxy_ip == '') {
        echo '请输入IP!!';
        return;
    }
    echo check_proxy_is_useful('single');
}
-
- function get_proxy_by_collect(){
- global $params, $config;
- $params['url'] = trim($params['url']);
- if($params['url'] == '') {
- echo '请输入url!';
- return;
- }
- //$url = 'http://www.dn28.com/html/75/n-5175.html';
- $con = iconv('GBK', 'UTF-8', file_get_contents($params['url']));
- preg_match ('/
|