A php spider crawler using curl and regular expressions

WBOYWBOYWBOYWBOYWBOYWBOYWBOYWBOYWBOYWBOYWBOYWBOYWB

Released: 2016-07-25 09:08:53

Original

1251 people have browsed it

Fengwang FCMS content management system
get.php: the crawling framework; it analyzes and processes page content and applies the relevant replacements.
std.php: general-purpose regular expressions.
news_67_com.php: the crawl/parse rules for http://news.67.com.
The crawler grabs the list pages first, then the content pages.
It still lacks monitoring, statistics, and error handling. Personally, I find it rather fun.

include_once dirname(__FILE__) . '/std.php';
$site = array(
'aname' => '中国娱乐网',
'domain' => 'news.67.com',
'dirname' => '目录名称，用于匹配基于目录不同的正文',
'gettype' => 'default',
//获取主文件
'creg' => '/(?si)(.*?)\<\!--文章 end-->/',
'code' => 'utf-8',
'sub' => '获取子目录正则',
'content' => 'tag1',
'img_upload'=> array('tag1' => ''),
//下一页
'reg_next' => '/(?is)下一页\>\><\/a>/',

'key0' => '/(?is)/',

'key0_ap' => array(array(',', '|'), ' '),

'tag0' => '/(?is)

([^<^>]*?)<\/h1>/',

'', '', '', '', '', '',

.*?(.*?)<\/p>\s*/',

\s*/',

\s*/',

\s*/',

\s*/',

'/(?is)\s*<\/center>\s*/',

\s*/',

', '', '', '

', '

'tag1_arp' => array(

array(

'/(?is)

<\/p>/',

'/(?is)<\/strong>/'

),

array(

'', ''

),

),

'strip' => array('tag1' => ''),

'tag2' => '/(?is)
导读：\s*(.*?)\s*<\/div>/',

'tag2_arp' => array(

array(

'/(?is)　/'

),

array(

''

),

),

'tag3' => '/(?is)(中国娱乐网)/',

'tag4' => '/(?is)
日期：(\d+-\d+-\d+ \d+:\d+:\d+).*?<\/div>/',

);

// Content-page field map: each output field name (left) is paired with the
// name of an extraction-rule key (right), e.g. 'key0' / 'tag0' as used in $site.
// How the pairing is consumed is not visible in this file.
$map = [
    'tag'      => 'key0',
    'title'    => 'tag0',
    'content'  => 'tag1',
    'summary'  => 'tag2',
    'source'   => 'tag3',
    'pub_date' => 'tag4',
];

// Rule set for the list (index) pages of www.67.com.
// NOTE(review): several regex literals below look damaged (HTML tags appear to
// have been stripped out of them); verify every pattern against the real pages.
$site_list = array(

'aname' => '中国娱乐网',

'domain' => 'www.67.com',

'gettype' => 'default',

// Presumably the regex that isolates the region of the page to work on — confirm in get.php.
'creg' => '/(?si)
(.*?)
/',

// Presumably the character encoding of the fetched pages — confirm in get.php.
'code' => 'gbk',

// Pattern around the "next page" link (the literal text 下一页 means "next page").
'reg_next' => '/(?si)

下一页<\/a><\/li>/',

// Link

'tag0' => '/(?is)
.*?[^<^>]*?<\/a>.*?<\/div>/',

// Title

'tag1' => '/(?is)
.*?([^<^>]*?).*?
/',

// Two parallel lists: patterns, then one replacement per pattern (all empty here).
// Presumably applied to the extracted 'tag1' value after matching — confirm in get.php.
'tag1_arp' => array(

array(

'/(?is)(组图)/',

'/(?is)(图)/',

// NOTE(review): this pattern and the next have an unclosed '(' and will not
// compile as PCRE; they were probably escaped literals originally — confirm.
'/(?is)(图../',

'/(?is)(组图../',

// NOTE(review): '.' with the 's' flag matches every character, so pairing this
// with an empty replacement would erase the whole value — confirm the intent.
'/(?is)./',

'/(?is)(《|》)/',

),

array(

'', '', '', '', '', '',

)

),

);

// List-page field map: output field name => name of the extraction-rule key
// in $site_list that supplies it.
$list_map = [
    'url'   => 'tag0',
    'title' => 'tag1',
];

// Left empty in this rule file; its consumer is not visible here.
$site_list_sub = [];

复制代码

// Shared regular-expression pieces; the per-site rule files include this file.
// `global` makes sure $std lands in the global scope even when this file is
// included from inside a function.
global $std;
$std = array(
    // Character-class fragment (no delimiters) for the characters allowed in a URL.
    // The hyphen is placed last so it is a literal '-' rather than the start of a
    // range: the previous ':-/' was a descending range, which PCRE rejects with
    // "range out of order in character class". The slash is escaped so the
    // fragment can be embedded in a '/'-delimited pattern.
    'url' => '[0-9a-zA-Z.:\/%_#;&-]+',

    // NOTE(review): this pattern is empty apart from its flags and matches the
    // empty string; the original image-tag expression appears to be lost — restore it.
    'img' => '/(?is)/',
);

复制代码

/**
* test.php
*
* @author xzfred

* @copyright 2009 fengone.com

* @created 2010-12-07 .

* @version $Id: php.php 3 2008-10-10 07:49:21Z fred $

* SVNPath $HeadURL: http://192.168.0.16/svn/vim/skeletons/php.php $

*/

/*

include_once "std.php";

include_once "lady_163_com.php";

*/

// Manual smoke test: load the crawler core and one site rule file, download a
// page, run it through FcHtmlParse and dump the result.
include_once $GLOBALS['g_dir_core'] . "get.php";

//================================================================================

include_once DIR_HOST_TAG . '/tuku_ent_china_com.php';

// --- Content-page test ---------------------------------------------------------
$obj = new FcHtmlParse($site);

$page = file_get_contents("http://tuku.ent.china.com/fun/html/2011-08-23/181703.xml");
if ($page === false) {
    // Stop with a clear message instead of handing `false` to the parser.
    exit("Could not download the content page\n");
}

$c = $obj->parse($page);

// The separators use real newline escapes; without the backslashes they
// printed the literal text "nnn ===================n".
echo "\n\n\n ===================\n";

echo $c['field']['tag1'][0];

echo "\n\n\n ===================\n";

var_dump($c);

exit();

// --- List-page test ------------------------------------------------------------
// Unreachable while the exit() above is in place; remove that exit() to run it.
$obj = new FcHtmlParse($site_list);

$page = file_get_contents("http://tuku.ent.china.com/fun/html/3569_1.html");
if ($page === false) {
    exit("Could not download the list page\n");
}

$c = $obj->parse($page);

var_dump($c);

exit();

/*

$obj = new FcHtmlGet($site);

$c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');

var_dump($c);

$obj = new FcHtmlGet($site);

$c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');

var_dump($c);

$obj = new FcHtmlParse($site);

$img_obj = new FcHtmlImgUpload($site);

$data = file_get_contents("e:/b.html");

$c = $obj->parse($data);

$ic = $img_obj->upload($c['tag']['tag1'][0]);

var_dump($ic);

$data = file_get_contents("e:/a.html");

$c = $obj->parse($data);

$ic = $img_obj->upload($c['tag']['tag1'][0]);

var_dump($ic);

*/

//var_dump($c['tag']['tag1']);

复制代码

include_once $GLOBALS['g_dir_core'] . 'host/std.php';
// Rule set for content pages of tuku.ent.china.com.
// NOTE(review): several regex literals below look damaged (HTML/XML tags appear
// to have been stripped out of them); verify every pattern against real pages.
$site = array(
'aname' => '中华网娱乐图库',

'domain' => 'tuku.ent.china.com',

// Placeholder text, roughly: "directory name, used to match body text that
// differs per directory".
'dirname' => '目录名称，用于匹配基于目录不同的正文',

'gettype' => 'default',

// Fetch the main content

'creg' => '/(?si)(.*?<\/list>)/',

// Presumably the character encoding of the fetched pages — confirm in get.php.
'code' => 'utf-8',

// Placeholder text, roughly: "regex for fetching sub-directories".
'sub' => '获取子目录正则',

'content' => 'tag1',

'img_upload'=> array('tag1' => ''),

// Next page

'reg_next' => '/(?is)下一页<\/a>/',

// NOTE(review): empty apart from its flags, so it matches the empty string.
'key0' => '/(?is)/',

'key0_ap' => array(array(',', '|'), ' '),

// Captures the contents of a title="..." attribute.
'tag0' => '/(?is)title="([^"]*?)"/',

// Two parallel lists: patterns, then one replacement per pattern (all empty here).
// Presumably applied to the extracted 'tag0' value after matching — confirm in get.php.
'tag0_arp' => array(

array(

// NOTE(review): '$' is an end-of-subject anchor, so '$图$' is unlikely to match
// anything; it was probably a bracketed literal originally — confirm.
'/(?is)$图$/',

'/(?is)\"/',

'/(?is)独家：/',

'/(?is)独家:/',

'/(?is)(《|》)/',

),

array(

'', '', '', '', '',

)

),

'tag1' => '/(?is)(.*?)<\/list>/',

// Pattern / replacement pair; presumably applied before 'tag1' is extracted — confirm.
// NOTE(review): the replacement refers to $1 and $3, but the pattern as shown has
// no capture groups, so both would expand to empty strings.
'tag1_brp' => array(

array(

'/(?is)\s*\s*/'

),

array(

'

$1

$3
'

)

),

// Pattern / replacement pairs; presumably applied to 'tag1' after extraction — confirm.
// In '[^<^>]' the second '^' is a literal caret, so the class excludes '<', '^' and '>'.
'tag1_arp' => array(

array(

'/(?is)
([^<^>]*?)<\/p>/',

'/(?is)\<br\/\>/',

),

array(

'
$1

',

'',

)

),

'strip' => array('tag1' => ''),

// Screen name

'tag3' => '/(?is)([^<^>]*?)<\/span>/',

// Matches the literal site name 中华网.
'tag4' => '/(?is)(中华网)/'

);

// Content-page field map: output field name => name of the extraction-rule key
// in $site that supplies it.
$map = [
    'tag'     => 'key0',
    'title'   => 'tag0',
    'content' => 'tag1',
    'author'  => 'tag3',
    'source'  => 'tag4',
];

// Rule set for the list (index) pages of tuku.ent.china.com.
// NOTE(review): several regex literals below look damaged (HTML tags appear to
// have been stripped out of them); verify every pattern against the real pages.
$site_list = array(

'aname' => '中华网娱乐图库',

'domain' => 'tuku.ent.china.com',

'gettype' => 'default',

// Presumably the regex that isolates the region of the page to work on — confirm in get.php.
'creg' => '/(?si)
(.*?)
<\/div>/',

// Presumably the character encoding of the fetched pages — confirm in get.php.
'code' => 'utf-8',

// Pattern around the "next page" link (the literal text 下一页 means "next page").
'reg_next' => '/(?si)
下一页<\/a><\/li>/',

// Link

'tag0' => '/(?is)
.*?<\/span>[^<^>]*?<\/a><\/div>/',

// Rewrites '.htm' to '.xml'; presumably applied before 'tag0' is extracted — confirm.
// Note that '.html' therefore becomes '.xmll' unless the links end in '.htm'.
'tag0_brp' => array(

array(

'/(?is)\.htm/',

),

array(

'.xml'

)

),

// Title

'tag1' => '/(?is)
.*?<\/span>([^<^>]*?)<\/a><\/div>/',

// Two parallel lists: patterns, then one replacement per pattern (all empty here).
// Presumably applied to the extracted 'tag1' value after matching — confirm in get.php.
'tag1_arp' => array(

array(

// NOTE(review): '$' is an end-of-subject anchor, so '$图$' is unlikely to match
// anything; it was probably a bracketed literal originally — confirm.
'/(?is)$图$/',

'/(?is)\"/',

'/(?is)独家：/',

'/(?is)独家:/',

'/(?is)(《|》)/',

),

array(

'', '', '', '', '',

)

),

);

// List-page field map: output field name => name of the extraction-rule key
// in $site_list that supplies it.
$list_map = [
    'url'   => 'tag0',
    'title' => 'tag1',
];

// Left empty in this rule file; its consumer is not visible here.
$site_list_sub = [];

复制代码