Crawler_movie FTP-Download-Adresse

WBOY
Freigeben: 2016-08-08 09:21:16
Original
3389 Leute haben es durchsucht

Site: http://www.dy2018.com/

Datenbank: mysql Konto: root Passwort: 123456

Anweisung zum Erstellen der Tabelle: CREATE TABLE dy2018_url (id int(9) NOT NULL AUTO_INCREMENT, URL varchar(2000) NOT NULL, Status tinyint(2) NOT NULL, PRIMARY KEY(id));

Code:

<?php
	// Dispatch pending signals on every tick so that the handlers registered
	// below get a chance to run while the script executes ordinary PHP code.
	declare(ticks = 1);
	pcntl_signal(SIGQUIT, 'signal_handler');
	pcntl_signal(SIGTERM, 'signal_handler');

	// PIDs of the forked crawler children; filled in by run_dy2018() and read
	// by signal_handler().
	$crawlers_pid = array();
	// Number of finished crawl tasks; run_dy2018() compares it against the
	// number of recorded children.
	$finish_count = 0;

	/**
	 * Signal handler.
	 *
	 * On SIGQUIT or SIGTERM it sends SIGTERM to every PID stored in the global
	 * $crawlers_pid, prints an exit banner and terminates the current process.
	 * Any other signal number is ignored.
	 *
	 * @param int $signal Number of the delivered signal.
	 */
	function signal_handler($signal)
	{
		// Guard clause: only the two termination signals are handled.
		if ($signal != SIGQUIT && $signal != SIGTERM) {
			return;
		}

		global $crawlers_pid;
		foreach ($crawlers_pid as $child_pid) {
			posix_kill($child_pid, SIGTERM);
		}

		echo "---------- crawl task exit ----------";
		// MySQL link variable: pulled into scope but not used before exiting.
		global $con;
		exit();
	}

	/**
	 * Fetches the content behind a URL with a plain GET request.
	 *
	 * @param string $url Address to read.
	 * @return string|false Whatever file_get_contents() returns for $url
	 *                      (false when the read fails).
	 */
	function get_page_content($url)
	{
		return file_get_contents($url);
	}

	/**
	 * Fetches the content behind a URL by sending a form-encoded POST request.
	 *
	 * @param string $url Address to post to.
	 * @param array  $arr Form fields; encoded with http_build_query().
	 * @return string|false Response body, or false when the request fails.
	 */
	function get_page_content_by_post($url, $arr)
	{
		// The encoded body must live in $data: the original stored it in $arr
		// and then read an undefined $data, so an empty body was sent.
		$data = http_build_query($arr);
		$opts = array(
			'http' => array(
				'method' => 'POST',
				// Header fields are separated by CRLF; one field per line.
				'header' => "Content-Type: application/x-www-form-urlencoded\r\n"
					.'Content-Length: '.strlen($data)."\r\n",
				'content' => $data,
			),
		);
		$context = stream_context_create($opts);
		return file_get_contents($url, false, $context);
	}

	/**
	 * Main dy2018 crawl flow: forks one child process per entry URL and waits
	 * until every child has finished.
	 *
	 * Each child opens its own MySQL connection, crawls its entry URL through
	 * crawl_process() and then exits. The parent records the child PIDs in the
	 * global $crawlers_pid and counts reaped children in the global
	 * $finish_count. This function does not return: the process exits once all
	 * children are done (or when forking fails).
	 */
	function run_dy2018()
	{
		global $crawlers_pid;
		global $finish_count;

		$crawl_urls = array(
			"http://www.dy2018.com/html/tv/hytv/",
			"http://www.dy2018.com/html/tv/hepai/",
			"http://www.dy2018.com/html/tv/gangtai/",
			"http://www.dy2018.com/html/tv/oumeitv/",
			"http://www.dy2018.com/html/tv/rihantv/",
			"http://www.dy2018.com/html/tv/tvzz/",
		);
		// Numbered category pages http://www.dy2018.com/0/ ... /20/.
		for ($category = 0; $category <= 20; $category++) {
			$crawl_urls[] = "http://www.dy2018.com/".$category."/";
		}

		foreach ($crawl_urls as $i => $url) {
			$pid = pcntl_fork();
			if ($pid == -1) {
				echo "system error. check it now!";
				exit(1);
			}
			if ($pid > 0) {
				// Parent: remember the child and fork the next one.
				$crawlers_pid[$i] = $pid;
				continue;
			}

			// Child: it inherited a copy of the PID list; clear it so a signal
			// delivered to this child does not make it terminate its siblings.
			$crawlers_pid = array();
			$con = mysql_connect("localhost", "root", "123456");
			if (!$con) {
				die('Could not connect: '.mysql_error());
			}
			mysql_select_db("mysql", $con);
			crawl_process($url);
			mysql_close($con);
			// The child must stop here; falling through would make it continue
			// the fork loop and spawn crawlers of its own.
			exit(0);
		}

		// Parent: reap children without blocking. A blocking pcntl_waitpid()
		// would keep the tick-based signal handlers from running, so poll with
		// WNOHANG and sleep between polls instead.
		$total = count($crawlers_pid);
		while ($finish_count < $total) {
			$reaped = pcntl_waitpid(-1, $status, WNOHANG);
			if ($reaped > 0) {
				$finish_count++;
				continue;
			}
			if ($reaped == -1) {
				// No children left to wait for.
				break;
			}
			sleep(1);
		}

		echo "---------- crawl task finish ----------";
		exit();
	}

	/**
	 * Crawls one entry URL: walks its paginated index pages, opens every
	 * detail page linked from them and stores the FTP links found there in
	 * the dy2018_url table.
	 *
	 * Pagination stops at the first index page that cannot be fetched.
	 * Inserts go through the most recently opened mysql_* connection.
	 *
	 * @param string $url Entry (category) URL ending in "/".
	 */
	function crawl_process($url)
	{
		echo "start handle url:".$url;
		$page_idx = 1;
		// Relative links to detail pages, e.g. /i/12345.html
		$info_url_pattern = '/\/i\/\d+\.html/';
		// FTP links ending in a known media extension. The links are embedded
		// in the page text, so the pattern cannot be anchored with ^ and $.
		// "rmvb" must come before "rm", otherwise .rmvb links are cut to .rm.
		$ftp_url_pattern = '/ftp:\/\/.*?\.(swf|avi|flv|mpg|rmvb|rm|mov|wav|asf|3gp|mkv)/i';

		while (true) {
			$page_url = get_page_index_url($url, $page_idx);
			// echo instead of printf: a "%" in the URL must not be treated as
			// a format directive.
			echo "start crawl url:".$page_url."\n";
			$page_content = get_page_content($page_url);
			if (!is_valid_page($page_content)) {
				break;
			}

			$matches_urls = array();
			preg_match_all($info_url_pattern, $page_content, $matches_urls);
			// The same detail link can occur several times on an index page;
			// visit each one only once.
			foreach (array_unique($matches_urls[0]) as $detail_path) {
				$detail_url = 'http://www.dy2018.com'.$detail_path;
				$detail_page_content = get_page_content($detail_url);
				if (is_valid_page($detail_page_content)) {
					$detail_page_content = mb_convert_encoding($detail_page_content, "UTF-8", "GBK");
					$ftp_urls = array();
					preg_match_all($ftp_url_pattern, $detail_page_content, $ftp_urls);

					foreach (array_unique($ftp_urls[0]) as $ftp_link) {
						// The link comes from a remote page: escape it before
						// it goes into the SQL statement.
						$escaped_link = mysql_real_escape_string($ftp_link);
						$inserted = mysql_query("insert into dy2018_url (url, status) values('$escaped_link','0')");
						if (!$inserted) {
							fwrite(STDERR, "insert failed: ".mysql_error()."\n");
						}
					}
				}
				// Pause between detail-page requests.
				sleep(1);
			}
			$page_idx++;
		}
	}

	/**
	 * Builds the URL of the index page with the given page number.
	 *
	 * Page 1 is "<url>index.html", page N > 1 is "<url>index_N.html"; for any
	 * other page number the URL is returned unchanged.
	 *
	 * @param string $url Entry URL the file name is appended to.
	 * @param int    $idx Page number.
	 * @return string URL of the requested index page.
	 */
	function get_page_index_url($url, $idx)
	{
		if ($idx == 1) {
			return $url.'index.html';
		}
		if ($idx > 1) {
			return $url.'index_'.$idx.'.html';
		}
		return $url;
	}

	/**
	 * Decides from the fetched content whether a page counts as valid.
	 *
	 * @param mixed $content Page content as returned by the fetch helpers.
	 * @return bool True when $content is truthy, false otherwise.
	 */
	function is_valid_page($content)
	{
		return (bool) $content;
	}
	// Entry point: start the crawl. run_dy2018() ends the process itself via
	// exit(), so nothing placed after this call would run; no MySQL link is
	// opened in this scope, hence there is nothing to close here.
	run_dy2018();
?>
Nach dem Login kopieren

Ergebnis:

Damit ist der Crawler „crawler_movie“ zum Sammeln von FTP-Download-Adressen samt zugehörigem Code vorgestellt. Ich hoffe, der Beitrag ist für alle hilfreich, die sich für PHP-Tutorials interessieren.

Verwandte Etiketten:
Quelle:php.cn
Erklärung dieser Website
Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn
Beliebte Tutorials
Mehr>
Neueste Downloads
Mehr>
Web-Effekte
Quellcode der Website
Website-Materialien
Frontend-Vorlage