我是一个新手,想使用php抓取微信页面的正文和标题,标题直接用正则表达式|<title>(.*?)<\/title>|i
就抓取到了,但是正文我想的是|>(.*?)<\/|i
匹配标签间的内容,再去掉空格,但是还是会匹配到一堆标签,是怎么回事?或者说有其他更好的方法吗?求指教!
http://mp.weixin.qq.com/s?__biz=MzA5NTQ2NjUzMA==&mid=207136729&...
这是一般的微信文章内容链接
下面是这个链接的页面代码:
<!DOCTYPE html><html> <head> <script type="text/javascript">
// Head bootstrap script: optionally fires a timing beacon, records the article
// identifiers, and redirects to the rumor info page when the rumor flag is set.

// True when Math.random() falls below 0.001; gates the beacon request below.
var sampling = Math.random() < 0.001;
// Timestamp taken at this point, as a number (unary + applied to a Date).
var page_begintime = (+new Date());
// When sampled, assigning src on a fresh Image issues a request to this URL.
// The random "r" parameter presumably defeats caching — unverified.
(sampling) && ((new Image()).src = "http://isdspeed.qq.com/cgi-bin/r.cgi?flag1=7839&flag2=7&flag3=8&15=1000&r=" + Math.random());
// Article identifiers; they are reused as query parameters in the redirect URL below.
var biz = "MzA5NTQ2NjUzMA==";
var sn = "a82af7b7ba0bee9a7017b607dc7e5d4b" || "";
var mid = "207136729" || "";
var idx = "1" || "" ;
// Rumor-debunking requirement
// ""*1 evaluates to 0, so with these values both flags are falsy and the
// redirect branch below is not taken.
var is_rumor = ""*1;
var norumor = ""*1;
if (!!is_rumor&&!norumor){
// Redirect only when the referrer is missing or does not already contain the rumor path.
if (!document.referrer || document.referrer.indexOf("mp.weixin.qq.com/mp/rumor") == -1){
location.href = "http://mp.weixin.qq.com/mp/rumor?action=info&__biz=" + biz + "&mid=" + mid + "&idx=" + idx + "&sn=" + sn + "#wechat_redirect";
}
}
// Original-content requirement: needs to redirect to an intermediate page.
// The whole block below is commented out, so no such redirect happens here.
/*
var copyrightInfo = {
display_source : ""*1,
nocopyrightsource : ""*1
};
if (!!copyrightInfo.display_source&&!copyrightInfo.nocopyrightsource){
if (!document.referrer || document.referrer.indexOf("mp.weixin.qq.com/mp/reprint") == -1){
location.href = "http://mp.weixin.qq.com/mp/reprint?action=info&__biz=" + biz + "&mid=" + mid + "&idx=" + idx + "&sn=" + sn + "#wechat_redirect";
}
}*/
</script> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"><link rel="dns-prefetch" href="//res.wx.qq.com"><link rel="dns-prefetch" href="//mmbiz.qpic.cn"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=0" /><link rel="shortcut icon" type="image/x-icon" href="http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/images/icon/common/favicon22c41b.ico"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="format-detection" content="telephone=no"><script type="text/javascript">
// All three are empty strings in this copy of the page. pass_ticket is
// reassigned further down, after String.prototype.html has been defined.
var uin = "";
var key = "";
var pass_ticket = "";
/**
 * Convert between a small fixed set of HTML entities and their literal characters.
 *
 * Installed on String.prototype, so it is called as "text".html(encode).
 * Handled pairs: &#39; ('), &quot; ("), &nbsp; (plain space), &gt; (>),
 * &lt; (<), &yen; (U+00A5) and &amp; (&). Nothing else is touched.
 *
 * @param {boolean} encode - truthy: characters -> entities;
 *                           falsy: entities -> characters.
 * @returns {string} The converted copy; the receiver itself is not modified.
 */
String.prototype.html = function(encode) {
    // Flattened [entity, character, entity, character, ...] table.
    // "&amp;" must stay the LAST pair:
    //  - decoding walks the table front to back, so "&" is restored only after
    //    every other entity has been handled ("&amp;lt;" -> "&lt;", not "<");
    //  - encoding walks the reversed table, so "&" is escaped first and the
    //    "&" inside freshly produced entities is not escaped a second time.
    var replace = ["&#39;", "'", "&quot;", '"', "&nbsp;", " ", "&gt;", ">", "&lt;", "<", "&yen;", "\u00a5", "&amp;", "&"];
    // Work on a primitive string even when `this` is a String wrapper object.
    var str = String(this);
    if (encode) {
        // Reversing swaps every pair to [character, entity] and flips the order.
        replace.reverse();
    }
    for (var i = 0; i < replace.length; i += 2) {
        str = str.replace(new RegExp(replace[i], 'g'), replace[i + 1]);
    }
    return str;
};
// Pass pass_ticket through html(false) twice, replace every whitespace
// character with "+", then percent-encode the result with encodeURIComponent.
pass_ticket = encodeURIComponent(pass_ticket.html(false).html(false).replace(/\s/g,"+"));
</script> <title>Living Music校园歌唱大赛半决赛距离开赛还有???</title> <link rel="stylesheet" type="text/css" href="http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/style/page/appmsg/page_mp_article_improve251980.css"><style> </style><!--[if lt IE 9]><link rel="stylesheet" type="text/css" href="http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/style/page/appmsg/page_mp_article_improve_pc25624b.css"><![endif]--><script type="text/javascript">
document.domain = "qq.com";
</script> </head> <body id="activity-detail" class="zh_CN mm_appmsg" ontouchstart=""> <script type="text/javascript">
var write_sceen_time = (+new Date());
(sampling) && ((new Image()).src = "http://isdspeed.qq.com/cgi-bin/r.cgi?flag1=7839&flag2=7&flag3=8&16=1000&r=" + Math.random());
</script> <p id="js_cmt_mine" class="discuss_container editing access" style="display:none;"> <p class="discuss_container_inner"> <h2 class="rich_media_title">Living Music校园歌唱大赛半决赛距离开赛还有???</h2> <p class="frm_textarea_box_wrp"> <span class="frm_textarea_box"> <textarea id="js_cmt_input" class="frm_textarea" placeholder="评论将由公众帐号筛选后显示,对所有人可见。"></textarea> </span> </p> <p class="discuss_btn_wrp"><a id="js_cmt_submit" class="btn btn_primary btn_discuss btn_disabled" href="javascript:;">提交</a></p> <p class="discuss_list_wrp" style="display:none"> <p class="rich_tips with_line title_tips discuss_title_line"> <span class="tips">我的评论</span> </p> <ul class="discuss_list" id="js_cmt_mylist"></ul> </p> <p class="rich_tips tips_global loading_tips" id="js_mycmt_loading"> <img src="http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/images/icon/common/icon_loading_white22c04a.gif" class="rich_icon icon_loading_white" alt=""> <span class="tips">加载中</span> </p> <p class="wx_poptips" id="js_cmt_toast" style="display:none;"> <img alt="" class="icon_toast" 
src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGoAAABqCAYAAABUIcSXAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAA3NpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuNS1jMDE0IDc5LjE1MTQ4MSwgMjAxMy8wMy8xMy0xMjowOToxNSAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wTU09Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9tbS8iIHhtbG5zOnN0UmVmPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VSZWYjIiB4bWxuczp4bXA9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC8iIHhtcE1NOk9yaWdpbmFsRG9jdW1lbnRJRD0ieG1wLmRpZDoyMTUxMzkxZS1jYWVhLTRmZTMtYTY2NS0xNTRkNDJiOGQyMWIiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6MTA3QzM2RTg3N0UwMTFFNEIzQURGMTQzNzQzMDAxQTUiIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6MTA3QzM2RTc3N0UwMTFFNEIzQURGMTQzNzQzMDAxQTUiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENDIChNYWNpbnRvc2gpIj4gPHhtcE1NOkRlcml2ZWRGcm9tIHN0UmVmOmluc3RhbmNlSUQ9InhtcC5paWQ6NWMyOGVjZTMtNzllZS00ODlhLWIxZTYtYzNmM2RjNzg2YjI2IiBzdFJlZjpkb2N1bWVudElEPSJ4bXAuZGlkOjIxNTEzOTFlLWNhZWEtNGZlMy1hNjY1LTE1NGQ0MmI4ZDIxYiIvPiA8L3JkZjpEZXNjcmlwdGlvbj4gPC9yZGY6UkRGPiA8L3g6eG1wbWV0YT4gPD94cGFja2V0IGVuZD0iciI/Pmvxj1gAAAVrSURBVHja7J15rF1TFMbXk74q1ZKHGlMkJVIhIgg1FH+YEpEQJCKmGBpThRoSs5jVVNrSQUvEEENIhGiiNf9BiERICCFIRbUiDa2qvudbOetF3Tzv7XWGffa55/uS7593977n3vO7e5+199p7v56BgQGh0tcmvAUERREUQVEERREUQVEERREUQVEERREUQVEERREUQVEERREUQVEERVAUQVEERVAUQbVYk+HdvZVG8b5F0xj4RvhouB+eCy8KrdzDJc1RtAX8ILxvx98V1GyCSkN98Cx4z/95/Wn4fj6j6tUEeN4wkFSnw1MJqj5NhBfAuwaUHREUg4lqNMmePVsHll/HFhVfe1t3FwpJI8DXCCquDrCWNN4B6Tb4M3Z98aTPmTvh0YHl18PXw29yZiKejoPvcUD6E74yFBJbVDk6Bb7K8aP/Hb4c/tRzEYIqprPhSxzlf4Uvhb/0Xoig8qnHAJ3lqPMzfDH8XZ4LEpRf2sVdA5/sqPO9Qfop70UJyn+/boaPddT5yrq7VUUvTIVJI7q74MMddXR8NB1eXcYvhBpZm0s2w72/o86HFoKvLau/pYaXzjLMdUJ6y0LwtWV9CIIaXtvA8+G9HHV03u5q+K+yH47U0NoRngPv7KjzHDwTLj0bS1BDazfJJlcnOOostC6ysnCT+q80G/sIvFVge
W09D8FPVT0uoP7VfvAD8NjA8pqmuAN+OcYAjso0RbIZ8DGB5TVNcRO8JMaHY9SXSdfa3eeANJimWBLrA7JFiZwIXye+NMUV8CcxP2SRFjXefok7NRjSGZJlWUPvw2/wtNiQirSoXWyMsR28wR7AzzYM0oXw+Y7yK+CLJGeaoqjyrJSdZJD6Ov4+z5y6NJc0Az7NUecHydIUy+v60KNyQHoM3nKI1y7YCFiq0i7uBvgER52vDdKqWn9djhY1Dn4G3n6Ecqm2rF74dvgoR53S0hQxW9RJAZAGW5bSn58QJA27dQ7uIEedjywEX5NKVxCqsY6y+qA+LxFI4+yZ6oH0trWkNan80jygtIUsc5SflgAsDXgehfdx1KkkTRE76tN+Xue2jnTU0Ru1oIbvpt30bBtKhOp5yaaRkts0lic8V1i6dPcIRx2d/l8Y8XtNNEg7OOo8bl1kmmOKnDsO88CaYzejau0hWZqiL7C83oCH4SeTHvwV2BqqsHRVztSEYOmWF80NeXZT6Hd4KflResE9vCnBOlCyGfDNAstHTVPUDWoQ1t3iW+9WNizvlhfd4aerXd+ThqiMfNR6+9LvOOro5OY5JX2H4+F7HZD+kGzlamMgldWiirQsjcwWFbjmqZJteekJLK9pisvgL6RhKvuciZiwzrWWGapfrPy30kBVcSBIrw0aD3PU0XB6cehntq7rTMf7/2iQlktDVdXJLXlg6VjmiYBn6rWSTRCH6hvJ0hQrpcGq8oidsmHpTP8t8DGO9/vcWt9qabiqPgup1yKyQwvC2tSefZ73SSpNkUJ4PlLorlHZ+446nc8f3fIyywlJhwrTuwVSjBa1ccvSxN0hjjoK5xVrYZMd9V6XbFfgBukixTwGLg8sDam3dZR/wZ6L/dJlin1en8LS+bgpFbz3Ygvzu1J1HKxYNqxGpCmaCEo12rrBorD6LRp8UbpcdR5VWhTW35KlKd6QFqjuM2XzwlpnMxTvSkuUwuG/Xlg6NtPjbT6WFimF/VG6LEvXgn8QGDjMbBukVECFwhpoS+CQatfX2Q1q6H7wENHdrfCr0lKleEB9JyxNneus+VJpsVL9TwI6W65LovWIGl3KtVJaLv7LBwYTFEERFEVQFEERFEVQFEERFEVQFEERFEVQFEERFEVQFEERFFWq/hFgADUMN4RzT6/OAAAAAElFTkSuQmCC"> <p class="toast_content">已评论</p> </p> </p> </p> <p id="js_article" class="rich_media"> <p id="js_top_ad_area" class="top_banner"> </p> <p class="rich_media_inner"> <p id="page-content"> <p id="img-content" class="rich_media_area_primary"> <h2 class="rich_media_title" id="activity-name"> Living Music校园歌唱大赛半决赛距离开赛还有??? 
</h2> <p class="rich_media_meta_list"> <em id="post-date" class="rich_media_meta rich_media_meta_text">2015-04-27</em> <a class="rich_media_meta rich_media_meta_link rich_media_meta_nickname" href="javascript:void(0);" id="post-user">gzgsytw</a> <span class="rich_media_meta rich_media_meta_text rich_media_meta_nickname">gzgsytw</span> <p id="js_profile_qrcode" class="profile_container" style="display:none;"> <p class="profile_inner"> <strong class="profile_nickname">gzgsytw</strong> <img class="profile_avatar" id="js_profile_qrcode_img" src="" alt=""> <p class="profile_meta"> <label class="profile_meta_label">微信号</label> <span class="profile_meta_value">gzgsytw0</span> </p> <p class="profile_meta"> <label class="profile_meta_label">功能介绍</label> <span class="profile_meta_value">校园内有关团委的一切动态一切资讯都可在这里接收查找。</span> </p> </p> <span class="profile_arrow_wrp" id="js_profile_arrow_wrp"> <i class="profile_arrow arrow_out"></i> <i class="profile_arrow arrow_in"></i> </span> </p> </p> <p class="rich_media_content" id="js_content"><fieldset class="tn-Powered-by-XIUMI" style="white-space: normal; border: 0px; text-align: center; margin: 0.8em 0px 0.5em; box-sizing: border-box; padding: 0px;"><span class="tn-Powered-by-XIUMI" style="display: inline-block; padding: 0.3em 0.5em; border-radius: 0.5em; color: rgb(255, 255, 255); text-align: inherit; font-size: 1em; box-shadow: rgb(165, 165, 165) 0.2em 0.2em 0.1em; font-family: inherit; font-weight: inherit; text-decoration: inherit; border-color: rgb(71, 193, 168); box-sizing: border-box; background-color: rgb(71, 193, 168);"><p style="box-sizing: border-box;"><span style="font-size: 24px;">Living Music校园歌唱大赛半决赛距离开赛还有???</span></p></span></fieldset><p style="white-space: normal;"><br /></p><fieldset class="tn-Powered-by-XIUMI" style="white-space: normal; border: 0px; margin: 0px; clear: both; box-sizing: border-box; padding: 0px;"><section class="tn-Powered-by-XIUMI" style="padding: 8px; margin: 0px; border-left-width: 6px; 
border-left-style: solid; border-color: rgb(71, 193, 168); font-size: 1em; line-height: 1.4; text-align: inherit; font-family: inherit; font-weight: inherit; text-decoration: inherit; color: rgb(51, 51, 51); box-sizing: border-box;"><section class="tn-Powered-by-XIUMI" style="box-sizing: border-box;"><p><span style="font-size: 18px; font-family: 宋体;">精彩绝伦的复赛没有来观看?没关系!30进15的淘汰赛已经结束,15进5的半决赛就在<span style="font-size: 36px; font-family: 宋体; color: rgb(49, 133, 155);"><strong>今晚!</strong></span></span></p></section></section></fieldset><p style="white-space: normal;"><img data-s="300,640" data-type="jpeg" src="http://mmbiz.qpic.cn/mmbiz/UeZYRwyWLxVbrSdHIQokUcfwqLgHDeHAOIct2NKByo5P7WOGqux1uqDFAfUH922afy36pc8Ge8t8nyNCxKK6Ng/0" data-ratio="0.726790450928382" data-w="377" /><br /></p><p style="white-space: normal;"><br /></p><fieldset class="tn-Powered-by-XIUMI" style="white-space: normal; border: 0px; margin: 0px; clear: both; box-sizing: border-box; padding: 0px;"><section class="tn-Powered-by-XIUMI" style="padding: 8px; margin: 0px; border-left-width: 6px; border-left-style: solid; border-color: rgb(71, 193, 168); font-size: 1em; line-height: 1.4; text-align: inherit; font-family: inherit; font-weight: inherit; text-decoration: inherit; color: rgb(51, 51, 51); box-sizing: border-box;"><section class="tn-Powered-by-XIUMI" style="box-sizing: border-box;"><p><span style="font-size: 36px;"><strong><span style="font-family: 宋体; color: rgb(49, 133, 155);">今晚六点半</span></strong></span><span style="font-size: 18px;"><strong><span style="font-family: 宋体; color: rgb(49, 133, 155);"></span></strong><span style="font-family: 宋体;">Living Music大赛花都校区的十五强选手将在<span style="font-size: 36px; font-family: 宋体; color: rgb(49, 133, 155);"><strong>实验楼105</strong></span>举行的半决赛当中角逐出进入最终决赛的五名选手,届时将和三水校区进入决赛的五名选手共同组成广工商十强歌手来竞争广工商<span style="font-size: 36px; font-family: 宋体; color: rgb(255, 0, 0);"><strong>最强音的宝座!!</strong></span></span></span></p></section></section></fieldset><p 
style="white-space: normal;"><img data-s="300,640" data-type="jpeg" src="http://mmbiz.qpic.cn/mmbiz/UeZYRwyWLxVicCIE0fvVEsL865N96Ds2cuLr8deLzfyGmmNLd5dKgP4m0FQ57uNewg4tMn2wiadSxhEMqptr1dxQ/0" data-ratio="0.5964912280701754" data-w="456" /><br /></p><fieldset class="tn-Powered-by-XIUMI" style="white-space: normal; border: 0px; margin: 0px; clear: both; box-sizing: border-box; padding: 0px;"><section class="tn-Powered-by-XIUMI" style="padding: 8px; margin: 0px; border-left-width: 6px; border-left-style: solid; border-color: rgb(71, 193, 168); font-size: 1em; line-height: 1.4; text-align: inherit; font-family: inherit; font-weight: inherit; text-decoration: inherit; color: rgb(51, 51, 51); box-sizing: border-box;"><section class="tn-Powered-by-XIUMI" style="box-sizing: border-box;"><span style="font-size: 18px;">广工商最强音的诞生需要你的见证,come on 小伙伴,即使不是最专业的点评家,你也可以是<span style="font-size: 36px; color: rgb(49, 133, 155);"><strong>最专业的聆听者</strong></span><span style="font-size: 36px; color: rgb(49, 133, 155);">!</span></span></section></section></fieldset><p style="white-space: normal;"><img data-s="300,640" data-type="jpeg" src="http://mmbiz.qpic.cn/mmbiz/UeZYRwyWLxV1eIfFV9Do2a4aUp0koiadsKydXaDL3d696kOuEpXq5XSbrHk86fnicm6tG8ZPlQI6BsukIMlQmjGg/0" data-ratio="1.0588235294117647" data-w="289" /><br /></p><fieldset class="tn-Powered-by-XIUMI" style="color: inherit; font-family: inherit; font-size: 1em; font-weight: inherit; text-align: inherit; white-space: normal; border: 1px solid rgb(226, 226, 226); box-shadow: rgb(226, 226, 226) 0px 16px 1px -10px; line-height: 1.6; text-decoration: inherit; box-sizing: border-box; padding: 0px; background-color: rgb(255, 255, 255);"><section class="tn-Powered-by-XIUMI" style="padding: 20px; color: rgb(255, 255, 255); text-align: center; font-weight: bold; line-height: 1.4; box-shadow: rgb(221, 221, 221) 0px 3px 3px; font-size: 1.4em; font-family: inherit; text-decoration: inherit; border-color: rgb(71, 193, 168); box-sizing: border-box; 
background-color: rgb(71, 193, 168);"><section class="tn-Powered-by-XIUMI" style="box-sizing: border-box;"><strong><span style="font-family: 宋体; font-size: 24px; line-height: 25.2000007629395px;"><span style="line-height: 28px;">Living Music大赛花都校区</span>15进5的半决赛</span></strong></section></section><p style="margin-top: 24px; text-align: inherit; font-size: 1em; font-family: inherit; font-weight: inherit; text-decoration: inherit; color: inherit; border-color: rgb(71, 193, 168); box-sizing: border-box; line-height: 2em; background-color: transparent;"><img src="http://mmbiz.qpic.cn/mmbiz/MVPvEL7Qg0GPLZlicQq2RYNicbDmd1xQT0Gicv1A0tlNRtyWzrwk4Odopjzzpwqo4YJkUl3x7nMbGeLATIszGPJ2Q/640" class="tn-Powered-by-XIUMI" data-ratio="1" data-w="30" style="width: 30px; vertical-align: top; margin-left: 16px; box-sizing: border-box; background-color: rgb(71, 193, 168);" /> <span style="line-height: 25.6000003814697px; color: inherit; font-family: inherit; font-size: 1em; font-weight: inherit; text-align: inherit; text-decoration: inherit;">4月27日18:30(即今晚)</span></p><p style="margin-top: 16px; text-align: inherit; font-size: 1em; font-family: inherit; font-weight: inherit; text-decoration: inherit; color: inherit; border-color: rgb(71, 193, 168); box-sizing: border-box; background-color: transparent;"><img src="http://mmbiz.qpic.cn/mmbiz/MVPvEL7Qg0GPLZlicQq2RYNicbDmd1xQT0xTVEAtw1fibKcv7QmGPZ2SPibJyDvCcAI1jgWFZxhxb1XSBViae9ibboJg/640" class="tn-Powered-by-XIUMI" data-ratio="1" data-w="30" style="width: 30px; vertical-align: top; margin-left: 16px; box-sizing: border-box; background-color: rgb(71, 193, 168);" /> <span style="line-height: 25.6000003814697px; color: inherit; font-family: inherit; font-size: 1em; font-weight: inherit; text-align: inherit; text-decoration: inherit;">实验楼105</span></p><p style="margin-top: 16px; text-align: inherit; font-size: 1em; font-family: inherit; font-weight: inherit; text-decoration: inherit; color: inherit; border-color: rgb(71, 193, 168); 
box-sizing: border-box; background-color: transparent;"><span style="line-height: 25.6000003814697px; color: inherit; font-family: inherit; font-size: 1em; font-weight: inherit; text-align: inherit; text-decoration: inherit;"><br /></span></p></fieldset><p><br /></p></p> <script type="text/javascript">
var first_sceen__time = (+new Date());
</script> <link rel="stylesheet" type="text/css" href="http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/style/page/appmsg/page_mp_article_improve_combo25624b.css"> <p class="rich_media_tool" id="js_toobar"> <p id="js_read_area" class="media_tool_meta tips_global meta_primary" style="display:none;">阅读 <span id="readNum"></span></p> <span style="display:none;" class="media_tool_meta meta_primary tips_global meta_praise" id="like"> <i class="icon_praise_gray"></i><span class="praise_num" id="likeNum"></span> </span> <a id="js_report_article" style="display:none;" class="media_tool_meta tips_global meta_extra" href="javascript:void(0);">举报</a> </p> </p> <p class="rich_media_area_extra"> <p class="mpda_bottom_container" id="js_bottom_ad_area"></p> <p id="js_iframetest" style="display:none;"></p> </p> </p> <p id="js_pc_qr_code" class="qr_code_pc_outer" style="display:none;"> <p class="qr_code_pc_inner"> <p class="qr_code_pc"> <img id="js_pc_qr_code_img" class="qr_code_pc_img"> <p>微信扫一扫<br>关注该公众号</p> </p> </p> </p> </p> </p> <script>window.moon_map = 
{"a/gotoappdetail.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/a/gotoappdetail2562f8.js","a/ios.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/a/ios24a769.js","a/android.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/a/android22772d.js","a/profile.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/a/profile24a2ff.js","biz_common/utils/report.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_common/utils/report224ef3.js","biz_common/utils/cookie.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_common/utils/cookie224ef3.js","appmsg/reward_entry.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/reward_entry256315.js","appmsg/comment.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/comment255696.js","appmsg/like.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/like2340dc.js","appmsg/a.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/a25624b.js","biz_common/tmpl.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_common/tmpl224ef3.js","biz_common/ui/imgonepx.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_common/ui/imgonepx224ef3.js","biz_common/dom/attr.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_common/dom/attr22f190.js","biz_wap/utils/ajax.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_wap/utils/ajax22589f.js","biz_common/utils/string/html.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_common/utils/string/html224ef3.js","appmsg/report.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/report23c757.js","biz_common/dom/class.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_common/dom/class236751.js","appmsg/report_and_source.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/report_and_source23a582.js","appmsg/page_pos.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/page_pos253aa9.js","appmsg/cdn_speed_report.js":"http://res.wx.qq.com/mmbizwap/zh_CN/ht
mledition/js/appmsg/cdn_speed_report224ef3.js","appmsg/iframe.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/iframe24f185.js","appmsg/review_image.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/review_image2480be.js","appmsg/outer_link.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/outer_link224ef3.js","biz_wap/jsapi/core.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_wap/jsapi/core22589f.js","biz_common/dom/event.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_common/dom/event24f08a.js","appmsg/async.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/async25624b.js","biz_wap/ui/lazyload_img.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_wap/ui/lazyload_img23354e.js","biz_common/log/jserr.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_common/log/jserr22589f.js","appmsg/share.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/share251ea5.js","biz_wap/utils/mmversion.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_wap/utils/mmversion224ef3.js","appmsg/cdn_img_lib.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/cdn_img_lib23c757.js","biz_common/utils/url/parse.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_common/utils/url/parse238f07.js","appmsg/index.js":"http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/appmsg/index2567e4.js"};</script><script type="text/javascript" src="http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/js/biz_wap/moon230eaa.js"></script> <script id="t_cmt" type="text/html">
<li class="discuss_item" id="cid<# if (is_from_me == 1) { #><#=my_id#><# } else { #><#=content_id#><# } #>">
<# if(is_elected == 1){ #>
<p class="discuss_opr">
<span class="media_tool_meta tips_global meta_praise js_comment_praise <# if(like_status == 1){ #>praised<# } #>" data-status="<#=like_status#>" data-content-id='<#=content_id#>'>
<i class="icon_praise_gray"></i>
<span class="praise_num"><# if(like_num_format !== 0){ #><#=like_num_format#> <# } #></span>
</span>
</p>
<# } #>
<p class="user_info">
<strong class="nickname"><#=nick_name#><# if(is_from_friend == 1){ #>(朋友)<# } #></strong>
<img class="avatar" src="<#=logo_url#>">
</p>
<p class="discuss_message">
<span class="discuss_status"><#=status#></span>
<#=content#>
</p>
<p class="discuss_extra_info">
<#=time#>
<# if (is_from_me == 1) { #>
<a class="discuss_del js_del" href="javascript:;" data-my-id="<#=my_id#>" data-content-id="<#=content_id#>">删除</a>
<# } #>
</p>
<# if(reply && reply.reply_list && reply.reply_list.length > 0){ #>
<p class="reply_result">
<p class="nickname">作者回复</p>
<p class="discuss_message"><#=reply.reply_list[0].content#></p>
<p class="discuss_extra_info"><#=reply.reply_list[0].time#></p>
</p>
<# } #>
</li>
</script> <script id="t_ad" type="text/html">
<p class="rich_media_extra" id="gdt_area">
<# if(pos_type==0){ #>
<p class="rich_tips with_line title_tips">
<span class="tips">推广</span>
</p>
<# } #>
<p class="js_ad_link extra_link" data-type="<#=type#>" data-ticket="<#=ticket#>" data-url="<#=url#>" data-rl="<#=rl#>" data-aid="<#=aid#>" data-pt="<#=pt#>" data-tid="<#=traceid#>" data-gid="<#=group_id#>" data-apurl="<#=apurl#>">
<# if(pt==1){ #>
<#=hint_txt#>
<img class="icon_arrow_gray" src="http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/images/icon/common/icon_arrow_gray1ef6d4.png">
<img class="icon_loading_white icon_after" style="display:none;" id="loading_<#=traceid#>" src="http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/images/icon/common/icon_loading_white22c04a.gif">
<# }else if(pt==2){ #>
<# if (logo.indexOf("http://mmsns.qpic.cn/") == 0){ #>
<p class="brand_logo"><img src="<#=logo#>" alt="logo图片"></p>
<# } #>
<img class="appmsg_banner" src="<#=image_url#>">
<# if(watermark_type!=0){ #><i class="promotion_tag"><# if(watermark_type==1){ #>商品推广<# }else if (watermark_type==2){ #>活动推广<# } #></i><# } #>
<# }else if(pt==7){ #>
<p class="preview_group preview_card">
<p class="preview_group_inner card_inner">
<p class="preview_group_info">
<strong class="preview_group_title2"><#=hint_txt#></strong>
<p class="preview_group_desc"><#=ad_desc#></p>
<img src="<#=image_url#>" alt="" class="preview_card_avatar">
</p>
</p>
</p>
<# }else if(pt==100){ #>
<p class="preview_group">
<p class="preview_group_inner">
<p class="preview_group_info append_btn">
<strong class="preview_group_title"><#=biz_info.nick_name#></strong>
<p class="preview_group_desc"><#=hint_txt#></p>
<# if(!!biz_info.head_img){ #>
<img src="<#=biz_info.head_img#>" alt="" class="preview_group_avatar br_radius">
<# }else{ #>
<img class="preview_group_avatar br_radius" src="http://mmbiz.qpic.cn/mmbiz/a5icZrUmbV8p5jb6RZ8aYfjfS2AVle8URwBt8QIu6XbGewB9wiaWYWkPwq4R7pfdsFibuLkic16UcxDSNYtB8HnC1Q/0" alt="<#=biz_info.nick_name#>">
<# } #>
</p>
<p class="preview_group_opr">
<a id="js_view_profile_<#=pos_type#>" <# if(biz_info.is_subscribed == 0){ #>style="display:none"<# } #> class="btn btn_inline btn_primary btn_line js_ad_btn" href="javascript:void(0);">查看</a>
<a id="js_add_contact_<#=pos_type#>" data-url="<#=url#>" data-type="<#=type#>" data-tid="<#=traceid#>" data-rl="<#=rl#>" <# if(biz_info.is_subscribed ==1){ #>style="display:none"<# } #> class="btn btn_inline btn_line btn_primary js_ad_btn" href="javascript:void(0);">关注</a>
</p>
</p>
</p>
<# }else if(pt==102){ #>
<p class="preview_group">
<p class="preview_group_inner">
<p class="preview_group_info append_btn">
<strong class="preview_group_title"><#=app_info.app_name#></strong>
<p class="preview_group_desc"><#=hint_txt#></p>
<img src="<#=app_info.icon_url#>" alt="" class="preview_group_avatar br_radius">
</p>
<p class="preview_group_opr">
<a id="js_app_action_<#=pos_type#>" class="btn btn_inline btn_primary js_ad_btn btn_download" href="javascript:void(0);">下载</a>
</p>
</p>
</p>
<# }else if(pt==101){ #>
<p class="preview_group preview_card">
<p class="preview_group_inner card_inner">
<p class="preview_group_info append_btn">
<strong class="preview_group_title"><#=app_info.app_name#></strong>
<p class="preview_group_desc"><#=hint_txt#></p>
<img src="<#=app_info.icon_url#>" alt="" class="preview_card_avatar">
</p>
<p class="preview_group_opr">
<a href="javascript:void(0);" id="js_app_action_<#=pos_type#>" class="btn btn_inline btn_primary js_ad_btn">下载</a>
</p>
</p>
</p>
<# }else if(pt==103||pt==104){ #>
<p class="preview_group obvious_app">
<p class="preview_group_inner">
<p class="pic_app">
<img src="<#=image_url#>" alt="">
</p>
<p class="info_app">
<p class="name_app"><#=app_info.app_name#></p>
<# if(pt==103){ #>
<p class="profile_app" style="display:none;"><span class="fun_exp"><#=app_info._category#></span><em>|</em><span class="compacity"><#=app_info._app_size#></span></p>
<# } else if(pt==104){ #>
<p class="profile_app" style="display:none;"><span class="fun_exp"><#=app_info._app_size#></span><em>|</em><span class="compacity"><#=app_info._down_count#></span></p>
<# } #>
<p class="grade_app" id="js_app_rating_<#=pos_type#>">
<span class="js_stars stars" style="display:none;"></span>
<span class="js_scores scores">暂无评分</span>
</p>
<p class="dm_app">
<a href="javascript:void(0);" id="js_appdetail_action_<#=pos_type#>" class="ad_btn btn_download js_ad_btn">下载</a>
<p class="extra_info">来自<# if(pt==103){ #>App Store<# }else{ #>腾讯应用宝<# } #></p>
</p>
</p>
</p>
</p>
<# } #>
</p>
</p>
</script>
<script type="text/javascript">
// Page-level globals describing this article, followed by a seajs.use call for
// "appmsg/index.js". That module presumably reads these globals — TODO confirm.
var not_in_mm_css = "http://res.wx.qq.com/mmbizwap/zh_CN/htmledition/style/page/appmsg/not_in_mm24ed02.css";
var tid = "";
var aid = "";
var appuin = "MzA5NTQ2NjUzMA==";
var source = "5";
var scene = 75;
var itemidx = "";
var nickname = "gzgsytw";
// NOTE(review): looks like a Unix timestamp in seconds, kept as a string — confirm.
var ct = "1430117426";
var user_name = "gh_33771b05aabb";
var user_name_new = "";
var fakeid = "";
var version = "";
var is_limit_user = "0";
var msg_title = "Living Music校园歌唱大赛半决赛距离开赛还有???";
var msg_desc = "Living Music校园歌唱大赛半决赛距离开赛还有???精彩绝伦的复赛没有来观看?没关系!30进15的淘";
var msg_cdn_url = "http://mmbiz.qpic.cn/mmbiz/UeZYRwyWLxUfLK8xco6YMfZA4gghicMQOabZNs7h3QsZqiaXAwKzic0ca3SESBko2AeeeOwFpkyGs41cTrhL6ewhA/0?wx_fmt=jpeg";
var msg_link = "http://mp.weixin.qq.com/s?__biz=MzA5NTQ2NjUzMA==&mid=207136729&idx=1&sn=a82af7b7ba0bee9a7017b607dc7e5d4b#rd";
// Multiplying a quoted value by 1 coerces it to a number ("0"*1 is 0, ""*1 is 0).
var user_uin = "0"*1;
var msg_source_url = '';
var img_format = 'jpeg';
// Declared without a value here.
var networkType;
// The empty string is falsy, so this evaluates to '207136729'.
var appmsgid = '' || '207136729';
var comment_id = "0" * 1;
var svr_time = "1430658137" * 1;
var comment_enabled = "" * 1;
var is_need_reward = "0" * 1;
// "" * 1 is 0, so && short-circuits and is_https_res is 0 with this value.
var is_https_res = ("" * 1) && (location.protocol == "https:");
var devicetype = "";
seajs.use("appmsg/index.js");
</script>
</body></html>
随便折腾~见笑
这种问题,放弃用正则吧,要真正完备的你可能得写一篇文章那样长。但 html 已经是结构化的了,找个 html 解析库就行了,我刚用 firebug 粗略的看了下,正文是在 id 为 page-content 的 p 里。
当然,如果我所看到的是受 js 影响后的页面状态,那你可能得挂个浏览器内核来帮你获取到最终页面状态了。幸好,这个有很多开源的。
可以尝试使用DOM操作库simple-html-dom.php,快速获取HTML结构的内容:
可以把抓取到的内容写入置于内存上的SQLite(/run/shm/php/crawler.db3),避免频繁的磁盘IO.
把文章页面代码贴上来。
曾经尝试爬取微信文章,结果被黑警告 = =
没必要用正则表达式的,可以用PHP像jQuery那样直接操作DOM的,你可以参考一下。 http://www.cnphp.info/php-simple-html-dom-parser-intro.html
我之前是用 phpQuery 抓取微信文章的,从搜狗的入口进去,但是不能抓得太频繁。我的做法是先把列表抓下来存入数据库,然后用计划任务每 60 秒抓一次详情。解析详情页的 HTML 时要留意,我记得视频地址和图片的解析是需要特别注意的。
建议使用selenium工具通过浏览器渲染获取到加载完成后的界面,然后通过界面解析获取想要的内容~
看到一个爬微信的站http://dedecms5.com
是用工具库
phpquery
类似jQuery的方式解析html
用来抓站什么的很方便