Python fragt Baidu-SEO-Informationen ab-Python-Tutorial-php.cn

Python fragt Baidu-SEO-Informationen ab

高洛峰

Veröffentlicht: 2016-10-18 10:30:46

Original

1656 Personen haben diesen Beitrag angesehen

Eine einfache Python-Funktion zum Abfragen von Baidu-Keyword-Rankings. Merkmale:

1. Zufällig gewählter User-Agent (UA)

2. Einfache und bequeme Bedienung: einfach getRank(Keyword, Domainname) aufrufen

3. Kodierungskonvertierung. Es sollte kein Problem mit der Codierung geben.

4. Reichhaltige Ergebnisse: Neben dem Ranking werden auch der Titel, die URL und die Snapshot-Zeit des Suchergebnisses geliefert, sodass die SEO-Anforderungen erfüllt sind.

Nachteile:

Nur ein einzelner Thread, daher langsame Geschwindigkeit

#coding=utf-8
  
import requests
import BeautifulSoup
import re
import random
  
def decodeAnyWord(w):
    """Decode a byte string to text, trying UTF-8 first and GB2312 second.

    Args:
        w: the keyword as a byte string. A value that is already text is
            returned unchanged.

    Returns:
        The decoded text.

    Raises:
        UnicodeDecodeError: if the bytes are valid in neither encoding.
    """
    if not isinstance(w, bytes):
        # Already text: there is nothing to decode.
        return w
    try:
        return w.decode('utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8; fall back to the legacy Chinese encoding.
        return w.decode('gb2312')
  
def createURL(checkWord):   # create the Baidu URL for the search words
    """Build the Baidu search URL (100 results per page) for a keyword.

    Surrounding whitespace and embedded newlines are removed, spaces become
    ``+``, and every other reserved or non-ASCII character is
    percent-encoded (as UTF-8) so that characters such as ``&`` or ``#``
    in the keyword cannot corrupt the query string.

    Args:
        checkWord: the search keyword, as text or as UTF-8 bytes.

    Returns:
        The search URL as a string.
    """
    try:
        from urllib import quote_plus           # Python 2
    except ImportError:
        from urllib.parse import quote_plus     # Python 3

    checkWord = checkWord.strip()
    if not isinstance(checkWord, bytes):
        # quote_plus on Python 2 cannot handle non-ASCII text, so always
        # hand it UTF-8 bytes.
        checkWord = checkWord.encode('utf-8')
    checkWord = checkWord.replace(b'\n', b'')
    return 'http://www.baidu.com/s?wd=%s&rn=100' % quote_plus(checkWord)
  
def getContent(baiduURL, timeout=10):   # get the content of the SERP
    """Fetch a search result page through a random proxy with a random UA.

    Args:
        baiduURL: the URL to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests would block indefinitely on a dead proxy.

    Returns:
        The raw response body (``r.content``).

    Raises:
        requests.exceptions.RequestException: on connection errors or
            timeouts.
    """
    uaList = ['Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
    'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
    'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)']
    headers = {'User-Agent': random.choice(uaList)}
    # NOTE(review): these proxy addresses are hard-coded; confirm they are
    # still reachable before relying on this function.
    ipList = ['202.43.188.13:8080',
    '80.243.185.168:1177',
    '218.108.85.59:81']
    proxies = {'http': 'http://%s' % random.choice(ipList)}
    r = requests.get(baiduURL, headers=headers, proxies=proxies,
                     timeout=timeout)
    return r.content
  
def getLastURL(rawurl, timeout=10): # get the final URL when there are redirects
    """Request ``rawurl`` and return the URL of the final response.

    Args:
        rawurl: the URL to resolve.
        timeout: seconds to wait for the server before giving up, so one
            unresponsive target cannot hang the whole run.

    Returns:
        ``r.url`` of the response returned by ``requests.get``.

    Raises:
        requests.exceptions.RequestException: on connection errors or
            timeouts.
    """
    r = requests.get(rawurl, timeout=timeout)
    return r.url
  
def getAtext(atext):    # get the text between <a ...> and </a>
    """Return the inner text of the first ``<a ...>...</a>`` in ``atext``.

    ``<em>`` and ``</em>`` tags are removed from the returned text; any
    other markup is left in place.

    Args:
        atext: an HTML fragment containing an anchor element.

    Returns:
        The anchor's inner text, or an empty string when no anchor is found.
    """
    # re.S lets the anchor content span several lines.
    found = re.search(r'<a .*?>(.*?)</a>', atext, re.S)
    if found is None:
        return ''
    return found.group(1).replace('<em>', '').replace('</em>', '')
  
def getCacheDate(t):    # get the date of the cache snapshot
    """Extract the ``YYYY-M-D`` date that ends a ``<span class="g">`` element.

    Args:
        t: the HTML of the ``<span class="g">`` element as text.

    Returns:
        The date string, or an empty string when the span has no date.
    """
    # The date may be followed by ordinary whitespace, a non-breaking space,
    # or a literal "&nbsp;" entity before the closing tag, so accept all of
    # them rather than exactly two spaces.
    pat = re.compile(
        r'<span class="g">.*?(\d{4}-\d{1,2}-\d{1,2})(?:\s|&nbsp;)*</span>',
        re.U)
    found = pat.search(t)
    if found is None:
        return ''
    return found.group(1)
  
def getRank(checkWord, domain): # main line
    """Look up where ``domain`` ranks on Baidu for ``checkWord``.

    Downloads the result page for the keyword, walks every
    ``<table class="result">`` and stops at the first one whose
    ``<span class="g">`` text contains ``domain`` before its first ``/``.

    Args:
        checkWord: the keyword as a byte string (UTF-8 or GB2312); newlines
            are removed.
        domain: the domain name to look for, matched literally.

    Returns:
        For a hit, a GB2312-encoded byte string with the fields
        ``keyword, rank, title, cache date, final URL``; characters that
        GB2312 cannot represent are replaced instead of raising. When no
        result matches, the string ``'>100'``.
    """
    checkWord = checkWord.replace('\n', '')
    checkWord = decodeAnyWord(checkWord)
    cont = getContent(createURL(checkWord))
    soup = BeautifulSoup.BeautifulSoup(cont)

    # Escape the domain so its dots match literally, and compile the pattern
    # once instead of once per result.
    domainPat = re.compile(r'^[^/]*%s' % re.escape(domain))

    # find all results in this page
    for result in soup.findAll('table', {'class': 'result'}):
        resSpan = result.find('span', {'class': 'g'})
        if resSpan is None or not domainPat.match(unicode(resSpan)):
            continue

        nowRank = result['id']  # get the rank if the domain info matches

        resLink = result.find('h3').a
        domainURL = getLastURL(resLink['href'])     # get the target URL
        resTitle = getAtext(unicode(resLink))       # get the title of the target page
        cacheDate = getCacheDate(unicode(resSpan))  # get the cache date of the target page

        res = u'%s, 第%s名, %s, %s, %s' % (checkWord, nowRank, resTitle, cacheDate, domainURL)
        # 'replace' keeps one title with a character outside GB2312 from
        # aborting the whole lookup.
        return res.encode('gb2312', 'replace')

    return '>100'
  
domain = 'www.douban.com'  # set the domain which you want to search.

# r.txt holds one keyword per line; print the ranking line for each keyword.
# The with-block closes the file even if a lookup raises.
with open('r.txt') as f:
    for w in f:
        if not w.strip():
            continue    # a blank line is not a keyword; skip the useless query
        print(getRank(w, domain))

Nach dem Login kopieren