之前写过curl批处理采集数据,这里贴上完整版本,代码很简单,废话不说,上代码,新手欢迎指教!!!
代码只写到获取链接为止;至于排名,返回数组的键(从 0 开始)对应的就是排名顺序。
<?php
/**
 * Based on yahoo access to data
 *
 * Fetches several Yahoo! JAPAN search result pages for one keyword in
 * parallel through the curl_multi API and extracts the result links in
 * page order.
 *
 * @author     chujiu <527891885@qq.com>
 * @copyright  2014.04.26 By chujiu
 * @version    0.2.1 2014.04.26
 */

class DataCollectionRank {

    /** Step added to the "b" (start offset) query parameter for each request. */
    const PAGE = 10;

    /** @var string Directory that _writeFile() writes into; set by exec_curl_get_data(). */
    public $path = '';

    /** @var int Largest start offset requested; 91 with PAGE = 10 gives ten requests. */
    public $main = 91;

    /**
     * Create one curl easy handle per result page and attach them all to a
     * new curl multi handle.
     *
     * @param string $keyword Search keyword.
     * @return array|string '' when $keyword is empty, otherwise
     *                      array('handle' => array(offset => array('ch' => ..., 'url' => ...)), 'mh' => ...).
     */
    private function _gather_data($keyword) {
        if (empty($keyword)) {
            return '';
        }

        // 'handle' is pre-seeded so callers can iterate even if the loop below never runs.
        $chs = array('handle' => array());
        $mh = curl_multi_init();
        $encoded = urlencode($keyword);

        for ($i = 1; $i <= $this->main; $i += self::PAGE) {
            $url = 'http://search.yahoo.co.jp/search?p=' . $encoded
                 . '&tid=top_ga1_sa&ei=UTF-8&aq=-1&oq=' . $encoded
                 . '&pstart=1&fr=top_ga1_sa&b=' . $i;

            $ch = curl_init();
            // NOTE(review): peer verification is disabled; harmless for the
            // http:// URL above, unsafe if the scheme is ever changed to https.
            curl_setopt_array($ch, array(
                CURLOPT_URL            => $url,
                CURLOPT_HEADER         => false,
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT        => 30,
                CURLOPT_AUTOREFERER    => true,
            ));

            curl_multi_add_handle($mh, $ch);
            $chs['handle'][$i]['ch'] = $ch;
            $chs['handle'][$i]['url'] = $url;
        }

        $chs['mh'] = $mh;
        return $chs;
    }

    /**
     * Run all page requests and return the extracted links in page order.
     *
     * Transfer errors are appended to "get_rank_log.txt" inside $path.
     *
     * @param string $keyword Search keyword.
     * @param string $path    Directory used for the error log.
     * @return array|string '' when $keyword is empty; otherwise a zero-based
     *                      list of decoded link URLs (empty array when no
     *                      link was found).
     */
    public function exec_curl_get_data($keyword, $path) {
        $this->path = $path;

        $chs = $this->_gather_data($keyword);
        if (empty($chs)) {
            return '';
        }
        $mh = $chs['mh'];

        // Drive the transfers. curl_multi_select() blocks until a socket is
        // ready, so the loop no longer spins at 100% CPU while waiting.
        $active = null;
        do {
            do {
                $status = curl_multi_exec($mh, $active);
            } while ($status === CURLM_CALL_MULTI_PERFORM);

            if ($status !== CURLM_OK) {
                break;
            }
            if ($active > 0 && curl_multi_select($mh, 1.0) === -1) {
                // select() can fail immediately on some platforms; back off briefly.
                usleep(1000);
            }
        } while ($active > 0);

        $error = '';
        $links = array();
        foreach ($chs['handle'] as $entry) {
            $message = curl_error($entry['ch']);
            if ($message) {
                // Timestamp is year-month-day (the previous 'Y-d-m' swapped day and month).
                $error .= "\n" . 'error提示:' . $message . '-------URL:' . $entry['url']
                        . '--------时间:' . date('Y-m-d H:i:s') . "\n";
            } else {
                $body = curl_multi_getcontent($entry['ch']);
                if (!empty($body)) {
                    // Links are collected as a list rather than a '|'-joined
                    // string, so a URL containing '|' is never split.
                    foreach ($this->_get_keyword_link_preg($body) as $link) {
                        $links[] = $link;
                    }
                }
            }

            curl_multi_remove_handle($mh, $entry['ch']);
            curl_close($entry['ch']);
        }
        curl_multi_close($mh);

        if ($error != '') {
            $this->_writeFile('get_rank_log.txt', $error, 'ab+');
        }

        return $links;
    }

    /**
     * Extract the link URLs from one result page.
     *
     * Only the markup between '<div id="web">' and
     * '<div id="posS" class="spns">' is scanned, after dropping the "pg" and
     * "rel" blocks and any <em>...</em> fragments.
     *
     * @param string $str Raw HTML of a result page.
     * @return array URL-decoded href values in document order; empty when the
     *               page is empty or the '<div id="web">' marker is missing.
     */
    private function _get_keyword_link_preg($str) {
        if (empty($str)) {
            return array();
        }

        $parts = explode('<div id="web">', $str);
        if (!isset($parts[1])) {
            // Marker absent (layout change, error or block page): nothing to extract.
            return array();
        }

        $section = explode('<div id="posS" class="spns">', $parts[1]);
        $html = preg_replace('#<div id=\"pg\">[\s\S]+#', '', $section[0]);
        $html = preg_replace('#<div id=\"rel\">[\s\S]+#', '', $html);
        $html = preg_replace('#<em>[\s\S]+?</em>#', '', $html);

        $found = array();
        if (!preg_match_all('#href=\"(.*?)\">#', (string) $html, $found)) {
            return array();
        }
        return array_map('urldecode', $found[1]);
    }

    /**
     * Write $data to $fileName inside $this->path.
     *
     * @param string $fileName File name relative to $this->path.
     * @param string $data     Bytes to write.
     * @param string $method   fopen() mode; with "rb+" the file is truncated
     *                         to strlen($data) after writing.
     * @param int    $iflock   Take an exclusive lock while writing when truthy.
     * @param int    $check    When truthy, terminate the script if the target
     *                         path contains '..'.
     * @param int    $chmod    chmod the file to 0777 afterwards when truthy.
     * @return bool True when the data was written, false when the file could
     *              not be opened or written (previously this caused warnings
     *              or a TypeError on the false handle).
     */
    public function _writeFile($fileName, $data, $method = "rb+", $iflock = 1, $check = 1, $chmod = 1) {
        $target = $this->path . '/' . $fileName;

        if ($check && strpos($target, '..') !== false) {
            exit('403 Forbidden!');
        }

        @touch($target);
        $handle = @fopen($target, $method);
        if ($handle === false) {
            return false;
        }

        if ($iflock) {
            @flock($handle, LOCK_EX);
        }
        $written = @fwrite($handle, $data);
        if ($method == "rb+") {
            ftruncate($handle, strlen($data));
        }
        fclose($handle); // also releases the lock

        // NOTE(review): 0777 makes the file world-writable; kept for backward compatibility.
        if ($chmod) {
            @chmod($target, 0777);
        }

        return $written !== false;
    }
}
?>
/**
 * Remove duplicate rows from a list of rows and return them as
 * keyword/domain pairs.
 *
 * Two rows are duplicates when all of their values, compared as strings and
 * in order, are equal. Values are compared and returned whole, so a value
 * containing a comma is no longer split into separate columns.
 *
 * The result keeps the historical key layout: each surviving row is keyed by
 * the zero-based position of its LAST occurrence in $array, and rows appear
 * in the order of their FIRST occurrence.
 *
 * @param array $array List of rows; the first value of a row is the keyword,
 *                     the second the domain.
 * @return array position => array('keyword' => string, 'domain' => string|null);
 *               'domain' is null when the row has fewer than two values.
 */
function array_unique_fb($array) {
    $lastPosition = array(); // signature => position of the last row with that signature
    $cellsOf = array();      // signature => string-cast values of the row
    $position = 0;

    foreach ($array as $row) {
        // String-cast to keep the old join()-based comparison and output types.
        $cells = array_map('strval', array_values($row));
        $signature = serialize($cells);

        // Re-assigning an existing key keeps its first-seen slot in the array.
        $lastPosition[$signature] = $position;
        $cellsOf[$signature] = $cells;
        $position++;
    }

    $data = array();
    foreach ($lastPosition as $signature => $key) {
        $cells = $cellsOf[$signature];
        $data[$key] = array(
            'keyword' => isset($cells[0]) ? $cells[0] : '',
            'domain'  => isset($cells[1]) ? $cells[1] : null,
        );
    }
    return $data;
}