<script><br />
var MyMar;<br />
function monitoring()<br />
{<br />
if(document.readyState =='complete')<br />
{<br />
window.location='?i='+query_get();<br />
}<br />
}<br />
<br />
function query_get()<br />
{ <br />
var querystr = window.location.href.split("=");<br />
<br />
if(!Number(querystr[1]))<br />
{<br />
var value=0;<br />
}else{<br />
var value=Number(querystr[1]);<br />
}<br />
<br />
return Number(value)+1; <br />
}<br />
<br />
MyMar=setInterval('monitoring()',3000);<br />
</script>
<?php
/*===========================================================
= License:
= GPL
=------------------------------------------------------------
= Summary: URL collection functions (PHP 5)
= Version: 1.0
=------------------------------------------------------------
= Open-source "stal" project team
= Updated by: jd808
= Last modified: 2008-4-18
============================================================*/
$file = 'sitemap.xml';          // File Google needs; while the crawl runs it temporarily stores the home page's URLs
$temp_file = 'temp.xml';        // Temporary store for URLs found on inner pages
$url = "http://www.gyqpw.com/"; // Site to crawl
$timea = time();                // Start time of this request; only the three settings above need editing

// empty() treats a missing, empty or "0" value of i exactly like the previous
// !$_GET['i'] test did, but without raising an undefined-index notice.
if (empty($_GET['i'])) {
    // Fresh run: reset both work files, store the links found on the home page,
    // then send the browser to the crawl phase.
    file_put_contents($file, '');
    file_put_contents($temp_file, '');
    file_put_contents($file, (string)con($url, $timea));
    echo "<script>\nwindow.location='?i=bak';\n</script>";
} else {
    // Continue crawling the stored URL list from the requested position.
    consts($_GET['i'], $timea, $file, $url);
}
/**
 * Controller for the first pass: fetch the page at $url and return the links
 * found on it that belong to the site, one per line.
 *
 * @param string $url   Base URL of the site; also the page that is fetched.
 * @param int    $timea Start timestamp of the request (not used here).
 * @return string Newline-terminated list of unique URLs; '' when the page
 *                cannot be fetched or yields no matching link.
 */
function con($url,$timea)
{
    echo "<script>\ndocument.getElementById('link').innerHTML='正在收集 ".$url." 的信息!';\n</script>";

    $str = file_get_contents($url);
    if ($str === false) {
        return '';
    }

    // collection_url() may hand back a non-array when nothing was found;
    // the cast turns that into an empty list. array_unique() drops duplicates
    // while keeping the first occurrence of each URL.
    $links = array_unique((array)collection_url($str, $url));

    $strurl = '';
    foreach ($links as $link) {
        // Keep only links in which the base URL occurs exactly once.
        if (substr_count($link, $url) === 1) {
            $strurl .= $link."\n";
        }
    }
    return $strurl;
}
/**
 * Crawl the URLs listed in $file, starting at index $i, and collect the links
 * found on each page.
 *
 * The work is time-boxed: once 25 seconds have passed since $timea, memory()
 * is called with what was collected so far; it stores the links, redirects the
 * browser and exits. When the loop runs to the end of the list, the links
 * collected in this request are appended to the global $temp_file and
 * memorys() writes the final sitemap.
 *
 * @param mixed  $i      Index to start from, or 'bak' for the first pass.
 * @param int    $timea  Unix timestamp at which this request started.
 * @param string $file   File holding one URL per line.
 * @param string $urlys  Base URL of the site, passed on to collection_url().
 */
function consts($i,$timea,$file,$urlys)
{
    $str = (string)file_get_contents($file); // whole URL list as one string

    // memorys() overwrites $file with the finished XML document. If that has
    // already happened there is no URL list left to crawl.
    if (strncmp(ltrim($str), '<?xml', 5) === 0) {
        return;
    }

    $url = explode("\n", $str);
    $sum = count($url) - 1; // con() ends every URL with "\n", so the last element is empty

    // 'bak' marks the first pass; anything else is read as a non-negative index.
    $i = ($i === 'bak') ? 0 : max(0, (int)$i);

    /* Progress bar */
    $wid = ($sum > 0 ? round(min($i, $sum) / $sum * 100, 2) : 0)."%";
    $div = "\n".$wid."\n";
    // json_encode() produces a valid JS string literal even though $div spans lines.
    echo '<script>'."\n".'document.getElementById("scroll").innerHTML='.json_encode($div).';'."\n".'</script>';
    if (ob_get_level() > 0) {
        ob_flush(); // release the output buffer
    }
    flush();        // push everything that is not buffered to the browser
    /* Progress bar END */

    $collection_url = array();

    for ($j = $i; $j < $sum; $j++) {
        $current = trim($url[$j]);
        if ($current === '' || !detection_url($current)) { // skip blanks and non-page URLs
            continue;
        }

        if ((time() - $timea) >= 25) {
            memory($collection_url, $j); // store data; redirects and exits
        }

        /* Show the URL being processed */
        echo "<script>\ndocument.getElementById('link').innerHTML=".json_encode($current).";\n</script>";
        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
        /* Show the URL being processed END */

        // Best effort: an unreachable page is skipped instead of aborting the crawl.
        $urlstr = @file_get_contents($current);
        if ($urlstr !== false) {
            $collection_url[] = (array)collection_url($urlstr, $urlys);
        }

        if ((time() - $timea) >= 25) {
            memory($collection_url, $j); // store data; redirects and exits
        }
    }

    // Reaching this point means memory() never fired in this request, so the
    // links gathered above have not been written anywhere yet.
    $pending = '';
    foreach ($collection_url as $pageUrls) {
        foreach ($pageUrls as $found) {
            $pending .= $found."\n";
        }
    }
    if ($pending !== '' && !empty($GLOBALS['temp_file'])) {
        file_put_contents($GLOBALS['temp_file'], $pending, FILE_APPEND);
    }

    memorys(); // store data: generates the final XML
}
/**
 * Collect the link targets of all <a href> tags in one page and return those
 * that belong to the site (per page).
 *
 * Relative links are appended to the base URL $url; they are not resolved
 * against the directory of the page they were found on. Absolute http(s)
 * links are kept only when $url occurs in them exactly once. Fragment-only
 * links and links with another scheme (javascript:, mailto:, ...) are skipped.
 * Every kept URL is also echoed to the page as a log line.
 *
 * @param string $str HTML source of the page.
 * @param string $url Base URL of the site.
 * @return array List of absolute URLs; empty when nothing matches.
 */
function collection_url($str,$url)
{
    $urlall = array();
    if (!is_string($str) || $str === '') {
        return $urlall;
    }
    if (!preg_match_all('/<a\s[^>]*?href\s*=\s*["\']?([^"\'\s>]+)/i', $str, $matches)) {
        return $urlall;
    }

    $base = rtrim($url, '/').'/'; // exactly one slash between base and path

    foreach ($matches[1] as $href) {
        $href = ltrim(str_replace(array("\r\n", "\n", "\r"), '', $href));
        if ($href === '' || $href[0] === '#' || $href === '/#') {
            continue;
        }

        if (preg_match('#^https?://#i', $href)) {
            // Absolute link: keep it only when it points into this site.
            if (substr_count($href, $url) !== 1) {
                continue;
            }
            $found = $href;
        } elseif (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
            // Some other scheme; not a crawlable page.
            continue;
        } else {
            $found = $base.ltrim($href, '/');
        }

        $urlall[] = $found;

        // Log line; <br /> keeps entries on separate lines in the HTML output.
        echo htmlspecialchars($found, ENT_QUOTES, 'UTF-8')."<br />\n";
        print "<script>document.getElementById('logs').scrollTop = document.getElementById('logs').scrollHeight;</script>";
        if (ob_get_level() > 0) {
            ob_flush(); // release the output buffer
        }
        flush();        // push everything that is not buffered to the browser
    }
    return $urlall; // links found on this page
}
/**
 * Store the links collected so far and hand over to a new request.
 *
 * Appends every URL in $collection_url to the global $temp_file, emits a
 * script that redirects the browser to "?i=<index>" and ends the request.
 * Does nothing (and returns) when $collection_url is not an array.
 *
 * @param array $collection_url List of per-page URL lists.
 * @param int   $i              Index of the URL being processed when time ran out.
 */
function memory($collection_url,$i)
{
    global $temp_file;

    if (!is_array($collection_url)) {
        return;
    }

    $strts = '';
    foreach ($collection_url as $pageUrls) {
        if (!is_array($pageUrls)) {
            continue; // a page that produced no list
        }
        foreach ($pageUrls as $found) {
            $strts .= $found."\n";
        }
    }
    if ($strts !== '') {
        file_put_contents($temp_file, $strts, FILE_APPEND);
    }

    // Continue one position before $i, but never below 1: "?i=0" is treated as
    // a fresh start by the entry script and would wipe everything collected.
    $k = max(1, (int)$i - 1);

    echo "<script>\nwindow.location='?i=".$k."';\n</script>";
    exit;
}
/**
 * Generate the final XML sitemap.
 *
 * Reads the URL lists in the global $file and $temp_file (one URL per line),
 * removes duplicates and blank lines, overwrites $file with a sitemap in the
 * sitemaps.org 0.9 format (today's date as lastmod, "daily" as changefreq),
 * and emits a script that stops the reload timer and shows the completion
 * messages.
 */
function memorys()
{
    global $temp_file,$file;

    // Use the URLs as array keys so each one is kept once, in first-seen order.
    $seen = array();
    foreach (array($file, $temp_file) as $path) {
        $lines = is_file($path) ? file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) : false;
        if ($lines === false) {
            continue; // missing or unreadable list contributes nothing
        }
        foreach ($lines as $line) {
            $loc = trim($line);
            if ($loc !== '') {
                $seen[$loc] = true;
            }
        }
    }

    $today = date("Y-m-d");

    $xml = '<?xml version="1.0" encoding="UTF-8"?>'."\r\n"
         . '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'."\n";
    foreach (array_keys($seen) as $loc) {
        // "&" and friends must be entity-escaped inside <loc>.
        $xml .= "<url>\n"
              . "<loc>".htmlspecialchars((string)$loc, ENT_QUOTES, 'UTF-8')."</loc>\n"
              . "<lastmod>".$today."</lastmod>\n"
              . "<changefreq>daily</changefreq>\n"
              . "</url>\n";
    }
    $xml .= "</urlset>\r\n";

    file_put_contents($file, $xml);

    echo "<script>\n"
       . "clearInterval(MyMar);\n"
       . "document.getElementById('link').innerHTML='URL已经收集完成!';\n"
       . "document.getElementById('all_a').innerHTML='<b>XML生成已完成!</b>';\n"
       . "</script>";
}
/**
 * Decide whether a URL looks like a page worth fetching.
 *
 * A URL is accepted when it contains "=", when it ends with "/", or when the
 * text from its last "." onward is one of the listed page extensions
 * (compared case-sensitively).
 *
 * @param string $url URL to check.
 * @return bool True when the URL is accepted, false otherwise.
 */
function detection_url($url)
{
    // Anything carrying a "=" is taken to be a dynamic page.
    if (strpos($url, '=') !== false) {
        return true;
    }

    // Directory-style URL.
    if (substr($url, -1) === '/') {
        return true;
    }

    $pageExtensions = array('.php', '.html', '.htm', '.asp', '.aspx', '.shtml');

    // strrchr() yields false when there is no "." at all, which matches nothing.
    return in_array(strrchr($url, '.'), $pageExtensions, true);
}
?>