Scraping ultra-HD wallpapers: a Python crawler learning project, working build and source code
Features:
1. Batch-scrape HD images
2. Supports a custom cookie for scraping ultra-HD images (a paid membership is required; without one only one image per day downloads successfully) — see the sketch after the demo-video link
Demo video:
https://www.bilibili.com/video/BV1Cb4y1Q73j/
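A quick note on feature 2: the "custom cookie" is simply the Cookie request header copied from a browser session in which you are logged in to pic.netbian.com. A minimal, self-contained sketch (the variable name and cookie value below are placeholders, not part of the original script) of what that boils down to:

import requests

# Placeholder: replace with the Cookie header copied from your own logged-in
# browser session on pic.netbian.com (developer tools, Network tab).
my_cookie = 'PHPSESSID=xxxx; zkhanmlauth=xxxx'
headers = {
    'cookie': my_cookie,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
res = requests.get('https://pic.netbian.com/', headers=headers)
print(res.status_code)  # should print 200 if the request went through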
Code:
import requests, bs4, time, json, os


class NetbiAn():
    def __init__(self, cookie=None):
        '''
        Crawler for pic.netbian.com (the "Bi'an" wallpaper gallery)
        :param cookie: custom cookie string; if omitted, the author's default cookie is used
        '''
        self.url = 'https://pic.netbian.com/'
        if cookie is None:
            # fall back to the author's own cookie
            cookie = '__yjs_duid=1_5497b819a72afc9101dd25f2d5726a8e1616818734114; __guid=216607383.3773875649706524700.1616818734566.507; Hm_lvt_526caf4e20c21f06a4e9209712d6a20e=1616818735; zkhanecookieclassrecord=%2C54%2C66%2C; PHPSESSID=varai3ubq9gf8ri9vpb7ppsrm5; zkhanmlusername=%B7%B2%CA%C2%BF%B4%C8%BA%CE%C4%BC%FE; zkhanmluserid=4729080; zkhanmlgroupid=1; zkhanmlrnd=zkIOXHSZ8ya4GKjTuTxA; zkhanmlauth=0b23850ea3f69277fce68255bf7e776c; '
        self.headers = {
            'cookie': cookie,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest'}

    def get_HDimg_url(self, id):
        '''
        Get the download URL of the ultra-HD version of an image
        :param id: image id taken from the detail-page URL
        :return: full download URL
        '''
        api_url = 'https://pic.netbian.com/e/extend/downpic.php'
        params = {'id': id}
        res = requests.get(api_url, headers=self.headers, params=params)

        js = json.loads(res.text)
        return self.url + js['pic'].lstrip('/')

    def download_img(self, url, path):
        '''
        Download a single file
        :param url: download URL
        :param path: path of the file to save
        :return:
        '''
        try:
            res = requests.get(url, headers=self.headers)
            # 'wb' instead of 'ab' so a re-run overwrites instead of appending to an existing file
            with open(path, 'wb') as f:
                f.write(res.content)
            print(path, 'downloaded successfully!')
        except Exception as err:
            print(err, 'download failed!')

    def get_img_list(self, url='https://pic.netbian.com/new/', max_page=1, HD='1'):
        '''
        Scrape the images of a given category, e.g. the "latest" category https://pic.netbian.com/new/
        :param url: category URL
        :param max_page: number of pages to scrape, default 1
        :param HD: '2' to fetch the ultra-HD originals (requires a membership, otherwise only one
                   image per day succeeds); default '1' (normal quality)
        :return: a list of dicts, one per image, each holding the download URL and file name
        '''
        img_list = []
        # if the user passed a URL that already contains a page number, strip it off
        if url.find('index_') != -1:
            url = url.split('index_')[0]

        try:
            for page in range(max_page):
                print(f'Fetching image metadata of page {page + 1}...')
                if page + 1 >= 2:
                    res = requests.get(url + f'index_{page + 1}.html', headers=self.headers)
                else:
                    res = requests.get(url, headers=self.headers)
                res.encoding = 'gbk'
                bson = bs4.BeautifulSoup(res.text, 'lxml')
                bson = bson.select('#main > div.slist > ul > li')

                for item in bson:
                    d = {}
                    d['href'] = self.url + item.a['href'].lstrip('/')
                    res = requests.get(d['href'], headers=self.headers)
                    res.encoding = 'gbk'
                    bs = bs4.BeautifulSoup(res.text, 'lxml')
                    bs = bs.select_one('#img > img')
                    img_id = d['href'].split('/')[-1].split('.')[0]
                    if HD == '1':
                        # normal quality: the preview image embedded in the detail page
                        d['src'] = self.url + bs['src'].lstrip('/')
                    else:
                        # ultra-HD: ask the download API for the original
                        d['src'] = self.get_HDimg_url(img_id)
                    d['title'] = item.img['alt']
                    d['name'] = img_id + '_' + d['title'].replace(' ', '_') + '.' + \
                                d['src'].split('.')[-1]
                    img_list.append(d)
        except Exception as err:
            print(err)
        return img_list

    def download_batch(self, img_list, dir_, tt=0.2):
        '''
        Download every image in the list; the site rate-limits downloads, so threads are pointless
        :param img_list: list of image dicts returned by get_img_list
        :param dir_: directory to save into
        :param tt: delay between downloads, default 0.2 s
        :return:
        '''
        length = len(img_list)
        if length == 0:
            print('You are not a member, or you have already hit the daily limit!')
            return
        print(f'{length} download task(s) in total...')
        os.makedirs(dir_, exist_ok=True)
        for item in img_list:
            path = os.path.join(dir_, item['name'])
            self.download_img(item['src'], path)
            time.sleep(tt)


if __name__ == '__main__':
    print('Target site: https://pic.netbian.com/', 'For technical exchange only; do not use it for illegal or commercial purposes, or you bear the consequences!')
    print('Tip: with the 30-yuan yearly membership you can log in, paste your cookie below and scrape at least 200 ultra-HD images (10 pages) per day; the 1-yuan 7-day membership only allows 20 per day, i.e. a single page.')
    key = input('Press Enter to use the built-in cookie, or paste your own cookie here (leaving it empty is fine):\n')
    if len(key) > 5:
        bah = NetbiAn(key)
    else:
        bah = NetbiAn()
    HD = input('Download ultra-HD quality? (choose 2 only if you have a membership, otherwise choose 1):\n1. normal quality\n2. ultra-HD quality\n')
    type_url = input('Enter a category URL; leave it empty to use the default https://pic.netbian.com/new/\n')
    max_page = int(input('How many pages to scrape?\n'))
    if type_url == '':
        img_list = bah.get_img_list(max_page=max_page, HD=HD)
    else:
        img_list = bah.get_img_list(type_url, max_page, HD)

    bah.download_batch(img_list, 'img')
    input('All tasks finished!')
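
For anyone who would rather skip the interactive prompts, here is a short, non-interactive usage sketch of the NetbiAn class above (the argument values are just examples; pass your own cookie string to NetbiAn() if you want the ultra-HD path to work):

# Non-interactive usage sketch of the NetbiAn class defined above.
crawler = NetbiAn()                      # or NetbiAn('your cookie string') for member-only ultra-HD downloads
imgs = crawler.get_img_list('https://pic.netbian.com/new/', max_page=2, HD='1')  # '2' = ultra-HD
crawler.download_batch(imgs, 'img', tt=0.5)  # a slightly longer pause than the 0.2 s default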