本文介绍一个从一批新闻网页中提取文本的小程序,它可以将各篇新闻的内容存为以该新闻标题为文件名的文本文件。如有更好的处理方法,请和我联系:
lwx3069@sina.com
这里以人民网“今日要闻”栏目下的新闻为例。
($url) ? "" : $url = "http://www.unn.com.cn/GB/channel2/3/11/index.html"; // 今日要闻
if(isset($url)&&$url!="") {
$str = implode("",file($url));
$str_ary = explode("
",$str);
$str_ary = explode("- ",trim($str_ary[1]));
for ($i=0; $i
if (strlen(trim($str_ary[$i]))
continue;
}
echo "新闻".$i.":".$str_ary[$i];
$str1=strstr("$str_ary[$i]",'
$len1=strlen("$str1");
$len2=strlen("$str2");
$len=$len1-$len2;
$url=substr("$str1",10,$len-10);
if (strlen(trim($url))!=0) {
$url = "http://www.unn.com.cn/".$url;
define(CONTENTS_DIR,"./contents/");
if(isset($url)&&$url!="") {
$str = implode("",file($url));
$str1=explode('',$str); //去掉文件没用的上半部分
$str2 = explode('
',$str1[1]);
//取出文件的下半部分,并去掉没用的下半部分,这时得到的都是有用的
$str3=explode('',$str2[0]); //从整个有用部分取出文件标题和正文
$str4=explode('
',$str2[0]); //取出日期和时间
$str5=explode('',$str3[1]); //从标题和正文部分取出标题
$title=str_replace("
","",$str5[0]);
$str3=explode('',$str2[0]); //从整个有用部分取出文件正文
$str3[1]=str_replace('
',"\n"." ",$str3[1]);
$str3[1]=str_replace(' ',"",$str3[1]);
$str3=strip_tags($str3[1]);
$pf=trim($title).".txt";
$ppf=fopen(CONTENTS_DIR."$pf",'w');
fputs($ppf,$title);
fputs($ppf,"$str4[0]");
fputs($ppf,$str3);
}
}
}
}
?>
Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn