Zehn Methoden zum Crawlen von Webressourcen mit Python 3 – Python-Tutorial – php.cn

Zehn Methoden zum Crawlen von Webressourcen mit Python 3

Y2J

Veröffentlicht: 2017-05-11 11:04:14

Original

2964 Aufrufe

In den letzten zwei Tagen habe ich gelernt, wie man mit Python 3 Webressourcen crawlt, und dabei viele Methoden gefunden. Deshalb halte ich heute einige Notizen dazu fest.

1. Die einfachste Methode

# Simplest fetch: open the URL and read the whole response body as bytes.
import urllib.request

# The context manager closes the connection once the body has been read.
with urllib.request.urlopen('http://python.org/') as response:
    html = response.read()

Nach dem Login kopieren

2. Verwendung von Request

# Fetch a page through an explicit Request object, which can later carry
# POST data and extra headers.
import urllib.request

req = urllib.request.Request('http://python.org/')
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(req) as response:
    the_page = response.read()

Nach dem Login kopieren

3. Daten senden

#! /usr/bin/env python3
# POST URL-encoded form data (plus a Referer header) to a login page and
# print the decoded response body.

import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
values = {
    'act': 'login',
    'login[email]': 'yzhang@i9i8.com',
    'login[password]': '123456',
}

# urlencode() returns str, but in Python 3 the POST body must be bytes;
# passing a str makes urlopen() raise TypeError.
data = urllib.parse.urlencode(values).encode('utf-8')
req = urllib.request.Request(url, data)
req.add_header('Referer', 'http://www.python.org/')
with urllib.request.urlopen(req) as response:
    the_page = response.read()

print(the_page.decode("utf8"))

Nach dem Login kopieren

4. Daten und Header senden

#! /usr/bin/env python3
# POST URL-encoded form data with a custom User-Agent header and print the
# decoded response body.

import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'act': 'login',
    'login[email]': 'yzhang@i9i8.com',
    'login[password]': '123456',
}
headers = {'User-Agent': user_agent}

# urlencode() returns str, but in Python 3 the POST body must be bytes;
# passing a str makes urlopen() raise TypeError.
data = urllib.parse.urlencode(values).encode('utf-8')
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()

print(the_page.decode("utf8"))

Nach dem Login kopieren

5. HTTP-Fehler

#! /usr/bin/env python3
# Request a URL and, if the server answers with an HTTP error status,
# print the status code and the decoded error page.

# urllib.error is imported explicitly instead of relying on urllib.request
# having loaded it as a side effect.
import urllib.error
import urllib.request

req = urllib.request.Request('http://www.python.org/fish.html')
try:
    response = urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    print(e.code)
    print(e.read().decode("utf8"))
else:
    # Nothing to report on success; just release the connection.
    response.close()

Nach dem Login kopieren

6. Ausnahmebehandlung 1

#! /usr/bin/env python3
# Fetch a URL and distinguish between "server returned an error status"
# (HTTPError) and "server could not be reached" (URLError).

from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request("http://twitter.com/")
try:
    response = urlopen(req)
# HTTPError must be handled first: it is a subclass of URLError, so the
# URLError clause would otherwise catch it as well.
except HTTPError as e:
    print("The server couldn't fulfill the request.")
    print('Error code: ', e.code)
except URLError as e:
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    print("good!")
    # Close the connection once the body has been printed.
    with response:
        print(response.read().decode("utf8"))

Nach dem Login kopieren

7. Ausnahmebehandlung 2

#! /usr/bin/env python3
# Fetch a URL with a single URLError handler and tell HTTP error statuses
# apart from connection failures by inspecting the exception's attributes.

from urllib.request import Request, urlopen
from urllib.error import URLError

req = Request("http://twitter.com/")
try:
    response = urlopen(req)
except URLError as e:
    # Test for 'code' first: HTTPError (a URLError subclass) also exposes
    # 'reason', so checking 'reason' first would make this branch
    # unreachable and misreport HTTP errors as connection failures.
    if hasattr(e, 'code'):
        print("The server couldn't fulfill the request.")
        print('Error code: ', e.code)
    elif hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
else:
    print("good!")
    # Close the connection once the body has been printed.
    with response:
        print(response.read().decode("utf8"))

Nach dem Login kopieren

8. HTTP-Authentifizierung

#! /usr/bin/env python3
# Fetch a page protected by HTTP Basic authentication, first through a
# dedicated opener and then through urlopen() after installing that opener
# globally.

import urllib.request

# create a password manager
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()

# Add the username and password.
# If we knew the realm, we could use it instead of None.
top_level_url = "https://cms.tetx.com/"
password_mgr.add_password(None, top_level_url, 'yzhang', 'cccddd')

handler = urllib.request.HTTPBasicAuthHandler(password_mgr)

# create "opener" (OpenerDirector instance)
opener = urllib.request.build_opener(handler)

# use the opener to fetch a URL; the context manager closes the connection
a_url = "https://cms.tetx.com/"
with opener.open(a_url) as x:
    print(x.read())

# Install the opener.
# Now all calls to urllib.request.urlopen use our opener.
urllib.request.install_opener(opener)

with urllib.request.urlopen(a_url) as response:
    a = response.read().decode('utf8')
print(a)

Nach dem Login kopieren

9. Verwendung eines Proxys

#! /usr/bin/env python3
# Route urlopen() requests through a proxy by installing an opener that
# carries a ProxyHandler.

import urllib.request

# ProxyHandler maps the scheme of the *requested URL* to a proxy URL. The
# previous key 'sock5' matched no request scheme, so the proxy was silently
# ignored for http:// URLs.
# NOTE(review): urllib has no built-in SOCKS support; this assumes an HTTP
# proxy is listening on localhost:1080 - confirm.
proxy_support = urllib.request.ProxyHandler({'http': 'http://localhost:1080'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

with urllib.request.urlopen("http://g.cn") as response:
    a = response.read().decode("utf8")
print(a)

Nach dem Login kopieren

10. Timeout festlegen

#! /usr/bin/env python3
# Fetch a URL with a global socket timeout and print the raw response bytes.

import socket
import urllib.request

# timeout in seconds
timeout = 2
socket.setdefaulttimeout(timeout)

# this call to urllib.request.urlopen now uses the default timeout
# we have set in the socket module
req = urllib.request.Request('http://twitter.com/')
# The context manager closes the connection once the body has been read.
with urllib.request.urlopen(req) as response:
    a = response.read()
print(a)

Nach dem Login kopieren

[Verwandte Empfehlungen]

1. Kostenloses Python-Video-Tutorial

2. Python-Lernhandbuch

3. Video: Die Python-Grundgrammatik vollständig erklärt

Das war der ausführliche Inhalt von „Zehn Methoden zum Crawlen von Webressourcen mit Python 3“. Weitere Informationen finden Sie in anderen verwandten Artikeln auf der chinesischen PHP-Website!