This article presents a small program that extracts the text from a batch of news web pages. It saves the content of each news article as a text file, using the news title as the file name. If you have a better solution, please contact me:
lwx3069@sina.com
The example below uses the news listed under "Today's News" on People's Daily Online.
// Batch news extractor: reads an index page, pulls a link out of each entry found in it,
// downloads the linked article and writes its title, date/time and tag-stripped text to a
// file named after the title inside CONTENTS_DIR.
// NOTE(review): this listing appears to have lost its HTML markup during extraction - the
// opening PHP tag, several explode()/str_replace() delimiter arguments and part of the for
// loop header are missing, so it does not parse as written. Restore them from the original
// page markup before running.
// Fall back to the default index page when $url is empty.
( $url) ? "" : $url = "http://www.unn.com.cn/GB/channel2/3/11/index.html"; // Today's News
if(isset($url) &&$url!="") {
// Read the whole index page into a single string.
$str = implode("",file($url));
// NOTE(review): the delimiter below is a bare newline; presumably an HTML tag that marked
// the start of the news list was stripped from the listing - confirm.
$str_ary = explode("
",$str);
// Split the second chunk into entries on the dash-space separator.
$str_ary = explode("- ",trim($str_ary[1]));
// NOTE(review): the loop header is truncated (condition, increment and opening brace are
// missing); the surviving fragment appears to skip blank entries - confirm.
for ($i=0; $iif (strlen(trim($str_ary[ $i]))continue;
}
// Print the entry together with its index.
echo "News".$i.":".$str_ary[$i];
// Isolate the link inside the entry: $str1 and $str2 both come from strstr() on the entry,
// their length difference gives the span between the two markers, and substr() skips the
// first 10 characters of that span. A non-empty result is prefixed with the site root.
// NOTE(review): the first needle is missing and the second one has lost its closing quote;
// presumably they were the anchor href marker and the target attribute - confirm.
$str1=strstr("$ str_ary[$i]",$str2=strstr("$str_ary[$i]"," target);
$len1=strlen("$str1");
$len2=strlen("$str2 ");
$len=$len1-$len2;
$url=substr("$str1",10,$len-10);
if (strlen(trim($url))! =0) {
$url = "http://www.unn.com.cn/".$url;
// Output directory for the saved articles.
// NOTE(review): the constant name here and the fopen mode further down are unquoted
// barewords, which PHP 8 rejects as undefined constants.
define(CONTENTS_DIR,"./contents/");
if(isset( $url)&&$url!="") {
// Download the article page into a single string.
$str = implode("",file($url));
$str1=explode(,$str); // Cut off the unneeded upper part of the page
$str2 = explode(
,$str1[1]);
// Cut off the unneeded lower part of the page; what remains in $str2[0] is the useful part.
$str3=explode(,$str2[0]); // Get the title and text out of the useful part
$ str4=explode(,$str2[0]); // Get the date and time
$str5=explode(,$str3[1]); // Separate the title from the text part
$title=str_replace("
","",$str5[0]);
$str3=explode(
,$str2[0]) ; // Get the article text out of the useful part
// Rewrite two markers in the article text before the tags are stripped.
// NOTE(review): both search arguments are missing; presumably a paragraph or break tag and
// a non-breaking-space entity - confirm.
$str3[1]=str_replace(
,"
"." ",$str3[1]);
$str3[1]=str_replace( ,"",$str3[1]);
// Remove the remaining HTML tags; from here on $str3 is a plain string.
$str3=strip_tags($str3[1]);
// The output file name is the trimmed title plus the .txt extension.
$pf=trim($title).".txt";
// NOTE(review): the fopen result is not checked and the handle is never closed.
$ppf=fopen(CONTENTS_DIR."$pf",w);
// Write the title, then the date/time, then the article text.
fputs($ppf,$title);
fputs ($ppf,"$str4[0]");
fputs($ppf,$str3);
}
}
}
}
?>
Source: http://www.bkjia.com/PHPjc/532189.html (www.bkjia.com, TechArticle). Summary: This article is a small program that extracts text from a batch of news web pages. It can save the content of each news article as a text file with the news title as the file name. If there is a better way to deal with it,...