Home > php教程 > php手册 > body text

分享下页面关键字抓取components.arrow.com站点代码

WBOY
Release: 2016-06-13 09:44:04
Original
883 people have browsed it

复制代码 代码如下:


 /**
 * HOST: components.arrow.com
 */
 //set_time_limit(0);
 // base function
 function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
 {
 $ch = curl_init();
 if (!empty($data)) {
 $data = is_array($data)?http_build_query($data): $data;
 $url .= (strpos($url,'?')? '&': "?") . $data;
 }
 curl_setopt($ch, CURLOPT_URL, $url);
 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
 curl_setopt($ch, CURLOPT_POST, 0);
 curl_setopt($ch, CURLOPT_PORT, $port);
 curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); //是否抓取跳转后的页面
 $reffer && curl_setopt($ch, CURLOPT_REFERER, $reffer);
 if($proxy) {
 curl_setopt($ch, CURLOPT_PROXY, $proxy);
 curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
 curl_setopt($ch, CURLOPT_PROXYUSERPWD,"andhm001:andhm123");
 }

$result = array();
 $result['result'] = curl_exec($ch);
 if (0 != curl_errno($ch)) {
 $result['error'] = "Error:\n" . curl_error($ch);

}
 curl_close($ch);
 return $result;
 }

复制代码 代码如下:


/**
 * Perform an HTTP POST request with cURL.
 *
 * @param string       $url     target URL
 * @param array|string $data    request body handed to CURLOPT_POSTFIELDS
 * @param array        $header  request header lines; only applied when non-empty
 * @param int          $timeout connect timeout in seconds (CURLOPT_CONNECTTIMEOUT)
 * @param int          $port    port passed to CURLOPT_PORT
 * @return array 'result' holds the return value of curl_exec();
 *               'error' is present only when cURL reported an error
 */
function curl_post($url, $data = array(), $header = array(), $timeout = 15, $port = 80)
{
    $handle = curl_init();

    // Options are kept as an ordered list of [option, value] pairs and applied in sequence.
    $settings = array(
        array(CURLOPT_URL, $url),
        array(CURLOPT_RETURNTRANSFER, true),
        array(CURLOPT_CONNECTTIMEOUT, $timeout),
        array(CURLOPT_PORT, $port),
    );
    if (!empty($header)) {
        $settings[] = array(CURLOPT_HTTPHEADER, $header);
    }
    $settings[] = array(CURLOPT_POST, 1);
    $settings[] = array(CURLOPT_POSTFIELDS, $data);

    foreach ($settings as $setting) {
        curl_setopt($handle, $setting[0], $setting[1]);
    }

    $response = array('result' => curl_exec($handle));
    if (curl_errno($handle) != 0) {
        $response['error'] = "Error:\n" . curl_error($handle);
    }
    curl_close($handle);

    return $response;
}

/**
 * Fetch the HTML source of one search-result list page.
 *
 * Posts search_token / start / limit (fixed at 100) to the components.arrow.com
 * part-search URL through curl_post().
 *
 * @param string $keywords search keywords
 * @param int    $start    record offset to start from
 * @return string|false the value curl_post() reports under 'result'; false when $start is
 *                      negative or curl_post() reports an error
 */
function getListHtml($keywords, $start = 0)
{
    // NOTE(review): the comparison was lost in the published listing ("if ($start  {").
    // "< 0" is a reconstruction: getListNextPage() signals "no next page" with -1 and
    // callers only request offsets greater than 0 — confirm against the original script.
    if ($start < 0) {
        return false;
    }

    $postData = array(
        'search_token' => $keywords,
        'start' => $start,
        'limit' => 100,
    );

    // The keyword is user-supplied; encode it so spaces and reserved characters cannot
    // break the URL path.
    $searchUrl = 'http://components.arrow.com/part/search/' . rawurlencode($keywords);

    $result = curl_post($searchUrl, http_build_query($postData));
    if (isset($result['error'])) {
        return false;
    }

    return $result['result'];
}

/**
 * Extract the link hrefs from list-page HTML.
 *
 * Runs preg_match_all() on the HTML and returns the first capture group of every match.
 *
 * @param string $html list-page HTML source
 * @return array captured values ($matches[1]), or an empty array when nothing matches
 */
 function getListHref($html)
 {
 // NOTE(review): the pattern literal below starts with blank lines and contains a ")" with
 // no matching "(" — it looks truncated, presumably because markup was stripped when this
 // listing was published. TODO: restore the pattern from the original script.
 $pattern = '/

]+)">/isU';
 if (preg_match_all($pattern, $html, $matches))
 {
 return $matches[1];
 } else {
 // no matches
 return array();
 }
 }

/**
 * Extract the "start" offset of the next result page from list-page HTML.
 *
 * Reads the third quoted argument of the inline buildPagination(...) script call.
 *
 * @param string $html list-page HTML source
 * @return int the captured offset, or -1 when no buildPagination call is found
 */
function getListNextPage($html)
{
    $pattern = '/<script>buildPagination\(\'\d+\',\'\d+\',\'(\d+)\',\d+\);<\/script>/isU';
    if (preg_match($pattern, (string) $html, $matches)) {
        return intval($matches[1]);
    }

    return -1;
}

/**
 * Collect the detail-page links from every result page of a keyword search.
 *
 * @param string $keywords search keywords
 * @return bool|array false when $keywords is empty; otherwise the merged list of hrefs
 *                    (empty array when the first page yields no links)
 */
function getListHrefAll($keywords)
{
    if (empty($keywords)) {
        return false;
    }

    $html = getListHtml($keywords);
    if ($html === false) {
        // first page could not be fetched: no results
        return array();
    }

    $hrefList = getListHref($html);
    if (empty($hrefList)) {
        // no results
        return array();
    }

    // Remember every offset already requested so a page that keeps announcing the same
    // "next" offset cannot cause an endless loop.
    $visited = array(0 => true);
    $nextPage = getListNextPage($html);
    while ($nextPage > 0 && !isset($visited[$nextPage])) {
        $visited[$nextPage] = true;

        $html = getListHtml($keywords, $nextPage);
        if ($html === false) {
            break;
        }

        $hrefList = array_merge($hrefList, getListHref($html));
        $nextPage = getListNextPage($html);
    }

    return $hrefList;
}

/**
 * Fetch a part detail page and extract its fields.
 *
 * @param string $url path of the detail page, appended to http://components.arrow.com
 * @return array|false false when $url is empty; an empty array when the request fails or
 *                     no "Part No" entry is found; otherwise the field map initialised below
 *                     ('para' and 'price' become arrays only when values were found)
 */
function getDetail($url)
{
    if (empty($url)) {
        return false;
    }
    $host = 'http://components.arrow.com';

    $url = $host . $url;
    $response = curl_get($url);
    if (isset($response['error'])) {
        return array();
    }
    $html = (string) $response['result'];

    $result = array(
        'sup_part' => '', // supplier part number
        'sup_id' => '', // supplier ID
        'mfg_part' => '', // manufacturer part number
        'mfg_name' => '', // manufacturer name
        'cat_name' => '', // category name
        'para' => '', // attributes
        'desc' => '', // description
        'pdf_url' => '', // PDF URL
        'sup_stock' => '', // stock
        'min_purch' => '', // minimum order quantity
        'price' => '', // price
        'img_url' => '', // image URL
        'createtime' => '', // creation time
        'datacode' => '', // batch code
        'package' => '', // package
        'page_url' => '', // page URL
    );

    // mfg_part — a page without it is treated as unparseable
    $pattern = '/<li>[\s\n]*<strong>Part No:\s*<\/strong>(.+)<\/li>/isU';
    if (!preg_match($pattern, $html, $matches)) {
        return array();
    }
    $result['mfg_part'] = trim($matches[1]);

    // mfg_name
    $pattern = '/<li>[\s\n]*<strong>Manufacturer: <\/strong>(.+)<\/li>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['mfg_name'] = trim($matches[1]);
    }

    // cat_name
    $pattern = '/displayCategory\(\'(.[^\']+)\'\);/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['cat_name'] = str_replace('|', '>', trim($matches[1]));
    }

    // para — collected in a local array: writing array offsets into the '' placeholder
    // no longer converts it to an array on PHP >= 7.1
    $tablePattern = '/<table\s+id="part_specs".[^>]*>(.+)<\/table>/isU';
    if (preg_match($tablePattern, $html, $tableMatch)) {
        $rowPattern = '/<tr>[\s\n]*<td><strong>(.+)<\/strong><\/td><td>(.+)<\/td>[\s\n]*<\/tr>/isU';
        if (preg_match_all($rowPattern, $tableMatch[1], $rows)) {
            $para = array();
            foreach ($rows[1] as $index => $label) {
                $label = trim($label);
                $value = trim($rows[2][$index]);
                if ('Package Type' == $label) {
                    $result['package'] = $value;
                    continue;
                }
                $para[$label] = $value;
            }
            if (!empty($para)) {
                $result['para'] = $para;
            }
        }
    }

    // desc
    $pattern = '/<div\s+id="part_title">.+<h4>(.+)<\/h4>[\s\n]*<\/div>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['desc'] = trim($matches[1]);
    }

    // pdf_url
    $pattern = '/<li\s+class="datasheet">[\s\n]*<strong>Datasheet:<\/strong><a\s+href="(.[^"]+)"/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['pdf_url'] = $host . trim($matches[1]);
    }

    // sup_stock
    $pattern = '/<td\s+id="inv_1"\s+class="li_inv">([\d,]+)<\/td>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['sup_stock'] = str_replace(',', '', trim($matches[1]));
    }

    // min_purch
    $pattern = '/<span\s+id="multiples">[\s\n]*<strong>Multiple:\s*<\/strong>(.+)<\/span>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['min_purch'] = trim($matches[1]);
    }

    // price — same local-array approach as para
    $prices = array();
    $pattern = '/<div\s+id="price_1"\s+class="li_price">(.[^<]+)<\/div>/isU';
    if (preg_match($pattern, $html, $matches)) {
        $prices[1] = trim($matches[1]);
    }
    $pattern = '/<div\s+id="price_1"\s+class="li_price">[\s\n]*<span.[^>]+title="(.[^"]+)">/isU';
    if (preg_match($pattern, $html, $matches)) {
        // The attribute value is HTML-escaped; turn "&amp;" back into "&" before requesting it.
        // (The published listing showed a no-op '&' => '&' replacement.)
        $priceUrl = str_replace('&amp;', '&', $matches[1]);
        $priceResponse = curl_get($priceUrl);
        $json = isset($priceResponse['result']) ? $priceResponse['result'] : '';
        if (!empty($json)) {
            $decoded = json_decode($json, true);
            // Only iterate when the decoded structure actually has the expected shape.
            if (isset($decoded['parts'][0]['webprice']['resale'])
                && is_array($decoded['parts'][0]['webprice']['resale'])
            ) {
                foreach ($decoded['parts'][0]['webprice']['resale'] as $tier) {
                    if (isset($tier['minqty'], $tier['price'])) {
                        $prices[$tier['minqty']] = $tier['price'];
                    }
                }
            }
        }
    }
    if (!empty($prices)) {
        $result['price'] = $prices;
    }

    // img_url — the published listing showed "<img \s+src"; the extra space is treated as an
    // extraction artifact and the pattern follows the "<tag\s+attr" form used above
    $pattern = '/<div\s+id="part_image">[\s\n]*<img\s+src="(.[^"]+)"/isU';
    if (preg_match($pattern, $html, $matches)) {
        $result['img_url'] = trim($matches[1]);
    }

    // page_url
    $result['page_url'] = $url;

    return $result;
}

/**
 * Entry point: search by keyword and fetch the details of every hit.
 *
 * @param string $keywords search keywords
 * @return array one getDetail() result per link found; empty when there are no links
 */
function getData($keywords)
{
    $hrefList = getListHrefAll($keywords);
    if (!is_array($hrefList)) {
        // getListHrefAll() returns false for empty keywords
        return array();
    }

    $result = array();
    foreach ($hrefList as $href) {
        $result[] = getDetail($href);
    }

    return $result;
}

// Test Script
$keywords = isset($_GET['keywords']) ? trim($_GET['keywords']) : '';
$result = getData($keywords);

print_r($result);
Related labels:
source:php.cn
Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
Popular Recommendations
Popular Tutorials
More>
Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template