如何用python写一个爬虫,下载某论坛的动漫图片
实现过程
- 获取网页内容
import urllib.request
import re
import json
def getHttpStatusCode(tempUrl):
    """Probe *tempUrl* and report whether it is reachable.

    Returns 0 on success, 1 on an HTTP error response, 2 on a
    URL/connection error. Callers treat any value >= 1 as failure.
    Note: HTTPError is a subclass of URLError, so it must be caught first.
    """
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/49.0.2")]
    try:
        # Close the response explicitly; the original leaked the connection.
        with opener.open(tempUrl):
            pass
        return 0
    except urllib.error.HTTPError:
        print(tempUrl + "=访问页面出错")
        return 1
    except urllib.error.URLError:
        print(tempUrl + "=访问页面出错")
        return 2
def getcontent(url):
    """Fetch *url* and return the response body decoded as UTF-8."""
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection even if read/decode raises;
    # the original never closed the response.
    with urllib.request.urlopen(request) as response:
        data = response.read().decode("utf-8")
    return data
- 分析内容中的信息,提取需要的链接
def getcatalogarray(content):
    """Extract the 4chan catalog JSON embedded in a page.

    The catalog page inlines the data as
    ``var catalog = {......};var style_group`` — return the text captured
    between those two markers.
    """
    catalog_re = re.compile(r"var\scatalog\s=(.*).var\sstyle_group", re.M | re.S)
    match = catalog_re.search(content)
    return match.group(1)
- 获取网页中的图片下载链接
def getimageurls(content):
    """Return all image URLs linked from a thread page.

    Matches anchors like <a href="//i.4cdn.org/a/1553711154845s.jpg".
    The original character classes [j|p|g][p|n|i][g|f] also matched junk
    extensions ("jig", "ppf", literal "|"); this restricts matching to
    real image extensions and additionally accepts .jpeg.
    """
    pattern = r"a\shref=\"(//i\.4cdn\.org/.*?\.(?:jpg|jpeg|png|gif))\""
    res = re.findall(pattern, content)
    return res
- 下载图片
# Download one image: fetch image_download_url and write it to img_save_path.
urllib.request.urlretrieve(
    image_download_url, img_save_path
)
完整代码
tools.py
# -*- coding=utf-8 -*-
import urllib.request
import re
import json
"""
http工具
"""
def getcontent(url):
    """Fetch *url* and return the response body decoded as UTF-8."""
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
    request = urllib.request.Request(url, headers=headers)
    # Context manager closes the connection even if read/decode raises;
    # the original never closed the response.
    with urllib.request.urlopen(request) as response:
        data = response.read().decode("utf-8")
    return data
def savestr2file(filename, content):
    """Write *content* to *filename* as UTF-8 text.

    Uses a ``with`` block so the file handle is closed (and the buffer
    flushed) even if the write raises; the original leaked the handle
    on failure.
    """
    with open(filename, "w+", encoding="utf8") as output:
        output.write(content)
def getcatalogarray(content):
    """Extract the 4chan catalog JSON embedded in a page.

    The page inlines the data as ``var catalog = {......};var style_group``;
    return the captured text between the two markers. The separator is
    matched as a literal ';' — the original pattern used an unescaped '.'
    ("any character"), which could over-match on unexpected input.
    """
    pattern = r"var\scatalog\s=(.*);var\sstyle_group"
    res = re.search(pattern, content, re.M | re.S)
    return res.group(1)
def getimageurls(content):
"""获取网页中的图片链接 <a href="//i.4cdn.org/a/1553711154845s.jpg" """
pattern = r"a\shref=\"(//i.4cdn.org/.*?\.[j|p|g][p|n|i][g|f])\""
res = re.findall(pattern, content)
return res
# Probe a URL before downloading it.
def getHttpStatusCode(tempUrl):
    """Probe *tempUrl* and report whether it is reachable.

    Returns 0 on success, 1 on an HTTP error response, 2 on a
    URL/connection error. Callers treat any value >= 1 as failure.
    Note: HTTPError is a subclass of URLError, so it must be caught first.
    """
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/49.0.2")]
    try:
        # Close the response explicitly; the original leaked the connection.
        with opener.open(tempUrl):
            pass
        return 0
    except urllib.error.HTTPError:
        print(tempUrl + "=访问页面出错")
        return 1
    except urllib.error.URLError:
        print(tempUrl + "=访问页面出错")
        return 2
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Returns True when the directory was created, False when the path
    already existed. Leading/trailing whitespace and a trailing
    backslash are stripped first, mirroring the original behaviour.
    """
    import os

    path = path.strip().rstrip("\\")
    existed = os.path.exists(path)
    if not existed:
        # exist_ok=True tolerates a concurrent creation between the
        # exists() check and makedirs() (TOCTOU race in the original).
        os.makedirs(path, exist_ok=True)
    return not existed
main_a.py
# -*- coding=utf-8 -*-
# author vvyun
import re
import json
from tools import *
import os

# Board name, e.g. 4chan /a/.
basec = "a"
urlindexbase = "https://boards.4chan.org/" + basec + "/catalog"
urlthreadbase = "https://boards.4chan.org/" + basec + "/thread/"

# Fetch the catalog page HTML.
content = getcontent(urlindexbase)
# Optionally persist the raw catalog HTML for debugging:
# filename = "image/content_4chan_e.html"
# savestr2file(filename, content)

# Extract the embedded catalog JSON.
catalog_index = getcatalogarray(content)
# Optionally persist the catalog JSON for debugging:
# filenameimg = "image/catalog_4chan_e.json"
# savestr2file(filenameimg, catalog_index)

# "threads" maps thread id -> thread info; iterating it yields the ids
# (strings), which double as the thread URL path segments.
catalog_threads = json.loads(catalog_index)["threads"]

# Download every image of every thread.
for thread_url in catalog_threads:
    print(urlthreadbase + thread_url)
    # Fetch the thread page HTML.
    content_thread = getcontent(urlthreadbase + thread_url)
    # Optionally persist the thread HTML for debugging:
    # filename = "image/html/" + thread_url + ".html"
    # savestr2file(filename, content_thread)
    # Collect the image links of this thread.
    imagedata = getimageurls(content_thread)
    # One save directory per thread.
    img_save_basepath = "image/data/" + thread_url + "/"
    mkdir(img_save_basepath)
    for image_url in imagedata:
        # Links are protocol-relative ("//i.4cdn.org/...").
        imd = "https:" + image_url
        print(imd)
        # Only download URLs that answered the probe successfully.
        # (The original wrapped this in `try: ... except Exception as e:
        # raise e`, which re-raises unconditionally — a no-op, removed.)
        if getHttpStatusCode(imd) < 1:
            urllib.request.urlretrieve(
                imd, img_save_basepath + image_url.replace("/", "")
            )
github : https://github.com/nanonin/cs-note/tree/main/python/crawler/crawler4chan











网友评论