说明:urllib 是 Python 自带的标准库模块,无需额外安装,直接在 .py 文件头部导入即可使用。
使用包
import socket
import urllib.error
import urllib.parse
import urllib.request
语法1
urllib.request.urlopen(url, data, timeout)
- url:请求的地址(URL 字符串,或 Request 对象);
- data:请求体数据,必须是 bytes 类型;传入后请求方式变为 POST;
- timeout:超时时间,单位为秒。
# Example: only a URL is passed, so urlopen() issues a plain GET.
url = 'http://www.baidu.com'
# The response is a context manager; "with" closes the connection for us.
with urllib.request.urlopen(url) as response:
    print(response)
    # read() returns the body as bytes; decode it to get text.
    text = response.read().decode('utf-8')

# Passing parameters, variant A: put them in the URL query string.
# The request method stays GET.
url = 'http://www.baidu.com/s?wd=python'
with urllib.request.urlopen(url) as response:
    text = response.read().decode('utf-8')

# Passing parameters, variant B: pass them as urlopen()'s data argument.
# The request method becomes POST.
url = 'http://www.baidu.com/s'
params = {
    'wd': 'python',
}
# urlencode() renders the dict as "key=value" pairs; the request body has
# to be bytes, hence the encode().
data = urllib.parse.urlencode(params).encode('utf-8')
with urllib.request.urlopen(url, data) as response:
    text = response.read().decode('utf-8')

# Passing a timeout (in seconds).
url = 'http://www.baidu.com'
try:
    with urllib.request.urlopen(url, timeout=0.01) as response:
        text = response.read().decode('utf-8')
    print(text)
except urllib.error.URLError as e:
    # A timeout while connecting arrives wrapped in URLError.reason. Any
    # other reason is a different failure and is reported as such instead
    # of being mislabelled as a timeout.
    if isinstance(e.reason, socket.timeout):
        print('超时')
    else:
        print(e.reason)
except socket.timeout:
    # A timeout while waiting for or reading the response is raised
    # directly rather than wrapped in URLError.
    print('超时')
语法2
request = urllib.request.Request(url, data, headers, method)
urllib.request.urlopen(request)
例:
# Pattern: build a urllib.request.Request(url, data, headers, method)
# object first, then pass that object to urllib.request.urlopen().

# Request(url): only the URL is supplied.
url = 'https://movie.douban.com/top250'
request = urllib.request.Request(url)
# The response is a context manager; "with" closes the connection for us.
with urllib.request.urlopen(request) as response:
    text = response.read().decode('utf-8')

# Request(url, data): URL plus a request body.
# data must be bytes, and supplying it makes the request a POST.
url = 'http://www.baidu.com/s'
params = {
    'wd': 'python',
}
data = urllib.parse.urlencode(params).encode('utf-8')
request = urllib.request.Request(url, data)
with urllib.request.urlopen(request) as response:
    text = response.read().decode('utf-8')

# Request(url, headers=...): headers holds the request headers, such as
# User-Agent. The server can read User-Agent and use it to judge whether
# the request comes from a crawler or from a person using a browser.
url = 'http://httpbin.org/get'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.86 Safari/537.36'
    ),
}
request = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(request) as response:
    text = response.read().decode('utf-8')
print(text)
语法3
proxy_handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(proxy_handler)
res = opener.open(request)
或 res = opener.open(url)
# Send requests through a proxy.
url = 'http://httpbin.org/get'
# NOTE(review): the address below is just the sample value from the
# tutorial; substitute a proxy that is actually reachable.
proxies = {
    'http': 'http://113.120.63.179:9999',
}
proxy_handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(proxy_handler)

# Setting the User-Agent. The value is defined here so that this example
# does not depend on a variable left over from an earlier snippet.
user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/73.0.3683.86 Safari/537.36'
)

# Method 1: set it on the opener's addheaders list.
opener.addheaders = [('User-Agent', user_agent)]
# "with" closes the response instead of leaving the connection dangling.
with opener.open(url) as response:
    text = response.read().decode('utf-8')

# Method 2: set it on an individual Request object.
headers = {
    'User-Agent': user_agent,
}
request = urllib.request.Request(url, headers=headers)
with opener.open(request) as response:
    text = response.read().decode('utf-8')
print(text)
网友评论