A PHP spider crawler using cURL and regular expressions

WBOY
Release: 2016-07-25 09:08:53
Original
1139 people have browsed it
Fengwang fcms content management system
get.php: the crawling framework; it analyzes and processes web content and performs the relevant replacements.
std.php: general-purpose regular expressions.
news_67_com.php: the crawler analyzer for http://news.67.com.
The crawler grabs the list page first, then the content pages.
It still lacks monitoring, statistics, and error handling. Personally, I think it is fun to play with.
  1. include_once dirname(__FILE__) . '/std.php';
  2. $site = array(
  3. 'aname' => '中国娱乐网',
  4. 'domain' => 'news.67.com',
  5. 'dirname' => '目录名称,用于匹配基于目录不同的正文',
  6. 'gettype' => 'default',
  7. //获取主文件
  8. 'creg' => '/(?si)(.*?)\<\!--文章 end-->/',
  9. 'code' => 'utf-8',
  10. 'sub' => '获取子目录正则',
  11. 'content' => 'tag1',
  12. 'img_upload'=> array('tag1' => ''),
  13. //下一页
  14. 'reg_next' => '/(?is)下一页\>\><\/a>/',
  15. 'key0' => '/(?is)/',
  16. 'key0_ap' => array(array(',', '|'), ' '),
  17. 'tag0' => '/(?is)

    ([^<^>]*?)<\/h1>/',

  18. 'tag0_arp' => array(
  19. array(
  20. '/(?is)\(组图\)/',
  21. '/(?is)\(图\)/',
  22. '/(?is)\(图\.\./',
  23. '/(?is)\(组图\.\./',
  24. '/(?is)\./',
  25. '/(?is)(《|》)/',
  26. ),
  27. array(
  28. '', '', '', '', '', '',
  29. )
  30. ),
  31. 'tag1' => '/(?is)
    (.*?)[^/',
  32. 'tag1_brp' => array(
  33. array(
  34. '/(?is)(.*?)/',
  35. '/(?is)\(.*?\)/',
  36. '/(?is)\s*

    .*?(.*?)<\/p>\s*/',

  37. '/(?is)\s*

    \s*/',

  38. '/(?is)\s*

    \s*/',

  39. '/(?is) /',
  40. '/(?is)
    /',
  41. '/(?is)\s*

    \s*/',

  42. '/(?is)\s*

    \s*/',

  43. '/(?is)\s*
    \s*/',
  44. '/(?is)\s*<\/center>\s*/',
  45. '/(?is)\s*

    \s*/',

  46. ),
  47. array(
  48. '', '', '

    ', '

    ', '

    ', '', '', '

    ', '

    ', '

    ', '

    ', '

    '

  49. ),
  50. ),
  51. 'tag1_arp' => array(
  52. array(
  53. '/(?is)

    <\/p>/',

  54. '/(?is)<\/strong>/'
  55. ),
  56. array(
  57. '', ''
  58. ),
  59. ),
  60. 'strip' => array('tag1' => ''),
  61. 'tag2' => '/(?is)
    导读:\s*(.*?)\s*<\/div>/',
  62. 'tag2_arp' => array(
  63. array(
  64. '/(?is) /'
  65. ),
  66. array(
  67. ''
  68. ),
  69. ),
  70. 'tag3' => '/(?is)(中国娱乐网)/',
  71. 'tag4' => '/(?is)
    日期:(\d+-\d+-\d+ \d+:\d+:\d+).*?<\/div>/',
  72. );
  73. $map = array(
  74. 'tag' => 'key0',
  75. 'title' => 'tag0',
  76. 'content' => 'tag1',
  77. 'summary' => 'tag2',
  78. 'source' => 'tag3',
  79. 'pub_date' => 'tag4',
  80. );
  81. $site_list = array(
  82. 'aname' => '中国娱乐网',
  83. 'domain' => 'www.67.com',
  84. 'gettype' => 'default',
  85. 'creg' => '/(?si)
  86. 'code' => 'gbk',
  87. 'reg_next' => '/(?si)
  88. //链接
  89. 'tag0' => '/(?is)
  90. //标题
  91. 'tag1' => '/(?is)/',
  92. 'tag1_arp' => array(
  93. array(
  94. '/(?is)(组图)/',
  95. '/(?is)(图)/',
  96. '/(?is)(图../',
  97. '/(?is)(组图../',
  98. '/(?is)./',
  99. '/(?is)(《|》)/',
  100. ),
  101. array(
  102. '', '', '', '', '', '',
  103. )
  104. ),
  105. );
  106. $list_map = array(
  107. 'url' => 'tag0',
  108. 'title' => 'tag1',
  109. );
  110. $site_list_sub = array();
复制代码
  1. global $std;
  2. $std = array(
  3. 'url' => '[0-9a-zA-Z.:-/%_#;&]+',
  4. 'img' => '/(?is)/',
  5. );
复制代码
  1. /**
  2. * test.php
  3. *
  4. * @author xzfred
  5. * @copyright 2009 fengone.com
  6. * @created 2010-12-07 .
  7. * @version $Id: php.php 3 2008-10-10 07:49:21Z fred $
  8. * SVNPath $HeadURL: http://192.168.0.16/svn/vim/skeletons/php.php $
  9. */
  10. /*
  11. include_once "std.php";
  12. include_once "lady_163_com.php";
  13. */
  14. include_once $GLOBALS['g_dir_core'] . "get.php";
  15. //================================================================================
  16. include_once DIR_HOST_TAG . '/tuku_ent_china_com.php';
  17. $obj = new FcHtmlParse($site);
  18. $c = $obj->parse(file_get_contents("http://tuku.ent.china.com/fun/html/2011-08-23/181703.xml"));
  19. echo "nnn ===================n";
  20. echo $c['field']['tag1'][0];
  21. echo "nnn ===================n";
  22. var_dump($c);
  23. exit();
  24. //列表测试
  25. $obj = new FcHtmlParse($site_list);
  26. $c = $obj->parse(file_get_contents("http://tuku.ent.china.com/fun/html/3569_1.html"));
  27. var_dump($c);
  28. exit();
  29. /*
  30. $obj = new FcHtmlGet($site);
  31. $c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');
  32. var_dump($c);
  33. $obj = new FcHtmlGet($site);
  34. $c = $obj->getPage('http://star.pclady.com.cn/entertainment/ss/1106/703240.html');
  35. var_dump($c);
  36. $obj = new FcHtmlParse($site);
  37. $img_obj = new FcHtmlImgUpload($site);
  38. $data = file_get_contents("e:/b.html");
  39. $c = $obj->parse($data);
  40. $ic = $img_obj->upload($c['tag']['tag1'][0]);
  41. var_dump($ic);
  42. $data = file_get_contents("e:/a.html");
  43. $c = $obj->parse($data);
  44. $ic = $img_obj->upload($c['tag']['tag1'][0]);
  45. var_dump($ic);
  46. */
  47. //var_dump($c['tag']['tag1']);
复制代码
  1. include_once $GLOBALS['g_dir_core'] . 'host/std.php';
  2. $site = array(
  3. 'aname' => '中华网娱乐图库',
  4. 'domain' => 'tuku.ent.china.com',
  5. 'dirname' => '目录名称,用于匹配基于目录不同的正文',
  6. 'gettype' => 'default',
  7. //获取主文件
  8. 'creg' => '/(?si)(.*?<\/list>)/',
  9. 'code' => 'utf-8',
  10. 'sub' => '获取子目录正则',
  11. 'content' => 'tag1',
  12. 'img_upload'=> array('tag1' => ''),
  13. //下一页
  14. 'reg_next' => '/(?is)
  15. //链接
  16. 'tag0' => '/(?is)
  17. 'tag0_brp' => array(
  18. array(
  19. '/(?is)\.htm/',
  20. ),
  21. array(
  22. '.xml'
  23. )
  24. ),
  25. //标题
  26. 'tag1' => '/(?is)
  27. 'tag1_arp' => array(
  28. array(
  29. '/(?is)\(图\)/',
  30. '/(?is)\"/',
  31. '/(?is)独家:/',
  32. '/(?is)独家:/',
  33. '/(?is)(《|》)/',
  34. ),
  35. array(
  36. '', '', '', '', '',
  37. )
  38. ),
  39. );
  40. $list_map = array(
  41. 'url' => 'tag0',
  42. 'title' => 'tag1',
  43. );
  44. $site_list_sub = array();
复制代码


source:php.cn
Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
Popular Tutorials
More>
Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template
About us Disclaimer Sitemap
php.cn: Public welfare online PHP training, helping PHP learners grow quickly!