Heim > php教程 > php手册 > Hauptteil

分享下页面关键字抓取components.arrow.com站点代码

WBOY
Freigeben: 2016-06-13 09:44:04
Original
883 Leute haben es durchsucht

复制代码 代码如下:


 /**
 * HOST: components.arrow.com
 */
 //set_time_limit(0);
 // base function
 function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
 {
 $ch = curl_init();
 if (!empty($data)) {
 $data = is_array($data)?http_build_query($data): $data;
 $url .= (strpos($url,'?')? '&': "?") . $data;
 }
 curl_setopt($ch, CURLOPT_URL, $url);
 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
 curl_setopt($ch, CURLOPT_POST, 0);
 curl_setopt($ch, CURLOPT_PORT, $port);
 curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); //是否抓取跳转后的页面
 $reffer && curl_setopt($ch, CURLOPT_REFERER, $reffer);
 if($proxy) {
 curl_setopt($ch, CURLOPT_PROXY, $proxy);
 curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
 curl_setopt($ch, CURLOPT_PROXYUSERPWD,"andhm001:andhm123");
 }

$result = array();
 $result['result'] = curl_exec($ch);
 if (0 != curl_errno($ch)) {
 $result['error'] = "Error:\n" . curl_error($ch);

}
 curl_close($ch);
 return $result;
 }

复制代码 代码如下:


function curl_post($url, $data = array(), $header = array(), $timeout = 15, $port = 80)
 {
 $ch = curl_init();
 curl_setopt($ch, CURLOPT_URL, $url);
 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
 curl_setopt($ch, CURLOPT_PORT, $port);
 !empty ($header) && curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
 curl_setopt($ch, CURLOPT_POST, 1);
 curl_setopt($ch, CURLOPT_POSTFIELDS, $data);

$result = array();
 $result['result'] = curl_exec($ch);
 if (0 != curl_errno($ch)) {
 $result['error'] = "Error:\n" . curl_error($ch);

}
 curl_close($ch);

return $result;
 }

/**
 * 获取列表页的html源码
 * @param string $keywords 搜索关键字
 * @param int $start 开始记录数
 * @return boolean|array
 */
 function getListHtml($keywords, $start = 0)
 {
 if ($start  {
 return false;
 }

$postData = array(
 'search_token' => $keywords,
 'start' => $start,
 'limit' => 100,
 );

$result = curl_post('http://components.arrow.com/part/search/' . $keywords, http_build_query($postData));
 if ( isset($result['error']) )
 {
 return false;
 //exit($result['error']);
 }
 $result = $result['result'];

return $result;
 }

/**
 * 获取列表页 连接href
 * @param string $html html源码
 * @return array
 */
 function getListHref($html)
 {
 $pattern = '/

]+)">/isU';
 if (preg_match_all($pattern, $html, $matches))
 {
 return $matches[1];
 } else {
 // 没有匹配项
 return array();
 }
 }

/**
 * 获取下一页数字start
 * @param string $html html源码
 * @return number
 */
 function getListNextPage($html)
 {
 $pattern = '/<script>buildPagination\(\'\d+\',\'\d+\',\'(\d+)\',\d+\);<\/script>/isU';<BR> if (preg_match($pattern, $html, $matches))<BR> {<BR> return intval($matches[1]);<BR> } else {<BR> return -1;<BR> }<BR> }<br><br>/**<BR> * 获取列表也所有的详细列表<BR> * @param string $keywords 搜索关键字<BR> * @return boolean|array<BR> */<BR> function getListHrefAll($keywords)<BR> {<BR> if (empty($keywords))<BR> {<BR> return false;<BR> }<br><br>$html = getListHtml($keywords);<BR> $hrefList = getListHref($html);<BR> if (empty($hrefList))<BR> {<BR> // 没有结果<BR> return array();<BR> }<BR> $nextPage = getListNextPage($html);<BR> $loop =0;<BR> while ($nextPage > 0)<BR> {<BR> $html = getListHtml($keywords, $nextPage);<BR> $tmpHrefList = getListHref($html);<BR> $hrefList = array_merge($hrefList, $tmpHrefList);<BR> $nextPage = getListNextPage($html);<BR> $loop ++;<BR> }<BR> return $hrefList;<BR> }<br><br>/**<BR> * 获取详情页信息<BR> * @param string $url url地址<BR> * @return array()<BR> */<BR> function getDetail($url)<BR> {<BR> if ( empty($url) )<BR> {<BR> return false;<BR> }<BR> $host = 'http://components.arrow.com';<br><br>$url = $host . $url;<BR> $result = curl_get($url);<BR> if ( isset($result['error']) )<BR> {<BR> return array();<BR> //exit($result['error']);<BR> }<BR> $html = $result['result'];<br><br>$result = array(<BR> 'sup_part' => '', // 供应商型<BR> 'sup_id' => '', // 供应商ID<BR> 'mfg_part' => '', // 制造商型号<BR> 'mfg_name' => '', // 制造商名称<BR> 'cat_name' => '', // 分类名称<BR> 'para' => '', // 属性<BR> 'desc' => '', // 描述<BR> 'pdf_url' => '', // PDF地址<BR> 'sup_stock' => '', // 库存<BR> 'min_purch' => '', // 最小订购量<BR> 'price' => '', // 价格<BR> 'img_url' => '', // 图片地址<BR> 'createtime' => '', // 创建时间<BR> 'datacode' => '', // 批号<BR> 'package' => '', // 封装<BR> 'page_url' => '', // 页面地址<BR> );<br><br>// mfg_part<BR> $pattern = '/<li>[\s\n]*<strong>Part No:\s*<\/strong>(.+)<\/li>/isU';<BR> if (preg_match($pattern, $html, $matches))<BR> {<BR> $result['mfg_part'] = trim($matches[1]);<BR> } else {file_put_contents('page.txt', $html);die('xxx');<BR> return array();<BR> }<br><br>// mfg_name<BR> $pattern = '/<li>[\s\n]*<strong>Manufacturer: <\/strong>(.+)<\/li>/isU';<BR> if (preg_match($pattern, $html, $matches))<BR> {<BR> $result['mfg_name'] = trim($matches[1]);<BR> }<br><br>// cat_name<BR> $pattern = '/displayCategory\(\'(.[^\']+)\'\);/isU';<BR> if (preg_match($pattern, $html, $matches))<BR> {<BR> $result['cat_name'] = trim($matches[1]);<BR> $result['cat_name'] = str_replace('|', '>', $result['cat_name']);<BR> }<br><br>// para<BR> $tablepattern = '/<table\s+id="part_specs".[^>]*>(.+)<\/table>/isU';<BR> if (preg_match($tablepattern, $html, $matches))<BR> {<BR> $pattern = '/<tr>[\s\n]*<td><strong>(.+)<\/strong><\/td><td>(.+)<\/td>[\s\n]*<\/tr>/isU';<BR> if (preg_match_all($pattern, $matches[1], $matches))<BR> {<BR> foreach($matches[1] as $k=>$v)<BR> {<BR> $v = trim($v);<BR> if ('Package Type' == $v)<BR> {<BR> $result['package'] = trim($matches[2][$k]);<BR> continue;<BR> }<BR> $result['para'][$v] = trim($matches[2][$k]);<BR> }<BR> }<BR> }<br><br>// desc<BR> $pattern = '/<div\s+id="part_title">.+<h4>(.+)<\/h4>[\s\n]*<\/div>/isU';<BR> if (preg_match($pattern, $html, $matches))<BR> {<BR> $result['desc'] = trim($matches[1]);<BR> }<br><br>// pdf_url<BR> $pattern = '/<li\s+class="datasheet">[\s\n]*<strong>Datasheet:<\/strong><a\s+href="(.[^"]+)"/isU';<BR> if (preg_match($pattern, $html, $matches))<BR> {<BR> $result['pdf_url'] = $host . trim($matches[1]);<BR> }<br><br>// sup_stock<BR> $pattern = '/<td\s+id="inv_1"\s+class="li_inv">([\d,]+)<\/td>/isU';<BR> if (preg_match($pattern, $html, $matches))<BR> {<BR> $result['sup_stock'] = trim($matches[1]);<BR> $result['sup_stock'] = str_replace(',', '', $result['sup_stock']);<BR> }<br><br>// min_purch<BR> $pattern = '/<span\s+id="multiples">[\s\n]*<strong>Multiple:\s*<\/strong>(.+)<\/span>/isU';<BR> if (preg_match($pattern, $html, $matches))<BR> {<BR> $result['min_purch'] = trim($matches[1]);<BR> }<br><br>// price<BR> $pattern = '/<div\s+id="price_1"\s+class="li_price">(.[^<]+)<\/div>/isU';<BR> if (preg_match($pattern, $html, $matches))<BR> {<BR> $result['price'][1] = trim($matches[1]);<BR> }<BR> $pattern = '/<div\s+id="price_1"\s+class="li_price">[\s\n]*<span.[^>]+title="(.[^"]+)">/isU';<BR> if (preg_match($pattern, $html, $matches))<BR> {<BR> $priceurl = str_replace('&', '&', $matches[1]);<BR> $json = curl_get($priceurl);<BR> $json = $json['result'];<BR> if (! empty($json))<BR> {<BR> $jsonresult = json_decode($json, true);<BR> foreach ($jsonresult['parts'][0]['webprice']['resale'] as $k=>$v)<BR> {<BR> $result['price'][$v['minqty']] = $v['price'];<BR> }<BR> }<BR> }<br><br>// img_url<BR> $pattern = '/<div\s+id="part_image">[\s\n]*<img \s+src="(.[^"]+)"/isU';<BR alt="分享下页面关键字抓取components.arrow.com站点代码" > if (preg_match($pattern, $html, $matches))<BR> {<BR> $result['img_url'] = trim($matches[1]);<BR> }<br><br>// page_url<BR> $result['page_url'] = $url;<br><br>return $result;<BR> }<br><br>/**<BR> * 最终调用函数<BR> * @param string $keywords 搜索关键字<BR> * @return array<BR> */<BR> function getData($keywords)<BR> {<BR> $hrefList = getListHrefAll($keywords);<BR> $result = array();<br><br>foreach ($hrefList as $k=>$v)<BR> {<BR> $result[] = getDetail($v);<BR> }<br><br>return $result;<BR> }<br><br>// Test Script<BR> $keywords = trim($_GET['keywords']);<BR> $result = getData($keywords);<br><br>print_r($result);<BR> </script>
Verwandte Etiketten:
Quelle:php.cn
Erklärung dieser Website
Der Inhalt dieses Artikels wird freiwillig von Internetnutzern beigesteuert und das Urheberrecht liegt beim ursprünglichen Autor. Diese Website übernimmt keine entsprechende rechtliche Verantwortung. Wenn Sie Inhalte finden, bei denen der Verdacht eines Plagiats oder einer Rechtsverletzung besteht, wenden Sie sich bitte an admin@php.cn
Beliebte Empfehlungen
Beliebte Tutorials
Mehr>
Neueste Downloads
Mehr>
Web-Effekte
Quellcode der Website
Website-Materialien
Frontend-Vorlage