<?php
header("Content-type: text/html;charset=utf-8");
//set_time_limit(0);
$dbname = SAE_MYSQL_DB;
$host = SAE_MYSQL_HOST_M;
$port = SAE_MYSQL_PORT;
$user = SAE_MYSQL_USER;
$pwd = SAE_MYSQL_PASS;
$connect = @mysql_connect("{$host}:{$port}",$user,$pwd,true);
if(!$connect) {
die("Connect Server Failed:". mysql_error());
}
if(!mysql_select_db($dbname,$connect)) {
die("Select Database Failed:". mysql_error($connect));
}
mysql_query("set names 'utf8'");
$rules = array(
'start'=>'http://www.douluodalu.com.cn/jueshitangmen/6860.html',//开始采集的url
'title'=>'/<h1>(.*?)</h1>/',//文章title
'time'=>'/发布时间:(.*?) /',//发布时间
'content'=>'/"></div><p>([sS]*?)<div align=center>/',//内容
'next'=>'/下一篇: <a href="(.*?)"/',//下一篇网址
);
//每次排序,取出上一次的最后一篇url
$url = getLatest();
//最后一章的下一篇为空,由此循环
while($url != null && $url !=""){
$value = get($url);
$value = _prefilter($value);//去除空白字符,空格,回车
$context = getContent($value);
$context['url'] = $url;//当前url,同时还有下一篇的url
$url = $context['next'];
var_dump($url);
//防止重复
if(storage($context)){
storageWP($context);
};
}
echo"采集结束";
mysql_close($connect);
function storage($content_array){
global $connect;
$sql ="insert into `articles` (`id`, `title`, `time`, `url`, `content`) values(null,
'{$content_array['title']}',
'{$content_array['time']}',
'{$content_array['url']}',
'{$content_array['content']}');";
$result = mysql_query($sql,$connect);
return $result;
}
function storageWP($content_array){
global $connect;
$result = mysql_query("select max(ID) from wp_posts;",$connect);
$row = mysql_fetch_row($result);
$last_id = $row[0] +1 ;
$sql ="INSERT INTO `wp_posts` (`ID`, `post_author`, `post_date`, `post_date_gmt`, `post_content`, `post_title`, `post_excerpt`, `post_status`, `comment_status`, `ping_status`, `post_password`, `post_name`, `to_ping`, `pinged`, `post_modified`, `post_modified_gmt`, `post_content_filtered`, `post_parent`, `guid`, `menu_order`, `post_type`, `post_mime_type`, `comment_count`) VALUES (null,1,'{$content_array['time']}', '{$content_array['time']}', '{$content_array['content']}', '{$content_array['title']}', '', 'publish', 'open', 'open', '', '{$content_array['title']}', '', '', '{$content_array['time']}', '{$content_array['time']}', '', 0, 'http://iniu.sinaapp.com/?p={$last_id}', 0, 'post', '', 0);";
$result = mysql_query($sql,$connect);
$sql ="INSERT INTO `wp_term_relationships` (`object_id`, `term_taxonomy_id`, `term_order`) VALUES({$last_id}, 1, 0);";
$result = mysql_query($sql,$connect);
return $result;
}
function getContent($value){
global $rules;
preg_match($rules['title'],$value, $title);
preg_match($rules['time'],$value, $time);
preg_match($rules['next'],$value, $next);
preg_match($rules['content'],$value, $content);
$context = array(
'title' => addslashes($title[1]),
'time' => $time[1],
'next' => addslashes($next[1]),
'content' => addslashes($content[1])
);
return $context;
}
function getLatest(){
global $connect;
global $rules;
$sql ="SELECT url FROM `articles` ORDER BY id DESC LIMIT 1";
$result = mysql_query($sql,$connect);
$row=mysql_fetch_row($result);
if($row){
return $row[0];
}else{
return $rules['start'];
}
}
function get($url){
$ch = curl_init($url) ;
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true) ;
curl_setopt($ch, CURLOPT_BINARYTRANSFER, true) ;
$value = curl_exec($ch) ;
curl_close($ch);
return $value;
}
function _prefilter($output) {
strip_tags($output);
$output=preg_replace("///[Sftv ]*?;[r|n]/","", $output);
$output=preg_replace("/<!--[sS]*?-->/","", $output);
$output=preg_replace("/>[s]+</","><", $output);
$output=preg_replace("/;[s]+/",";", $output);
$output=preg_replace("/[s]+}/","}", $output);
$output=preg_replace("/}[s]+/","}", $output);
$output=preg_replace("/{[s]+/","{", $output);
$output=preg_replace("/([s]){2,}/","$1", $output);
$output=preg_replace("/[s]+=[s]+/","=", $output);
$output=preg_replace("/<br />/","",$output);
$output=preg_replace("/n/","",$output);
$output=preg_replace("/ /","",$output);
return $output;
}
?>
登入後複製