爬取超清美图python爬虫学习成品和源码

阳光电脑1987 · 发表于 2021-3-30 14:17:52

功能说明:
1.批量爬取高清图
2.支持自定义cookie爬取超清图(需要有会员,不然每天只能爬成功一张)

演示视频:
https://www.bilibili.com/video/BV1Cb4y1Q73j/
代码:
[Python] 纯文本查看复制代码import requests, bs4, time, json, os, urllib, timeclass NetbiAn(): def __init__(self, cookie=None): ''' 彼岸图库 :param cookie: 自定义cookie,不填就默认我的cookie ''' self.url = 'https://pic.netbian.com/' if cookie == None: self.headers = { 'cookie': '__yjs_duid=1_5497b819a72afc9101dd25f2d5726a8e1616818734114; __guid=216607383.3773875649706524700.1616818734566.507; Hm_lvt_526caf4e20c21f06a4e9209712d6a20e=1616818735; zkhanecookieclassrecord=%2C54%2C66%2C; PHPSESSID=varai3ubq9gf8ri9vpb7ppsrm5; zkhanmlusername=%B7%B2%CA%C2%BF%B4%C8%BA%CE%C4%BC%FE; zkhanmluserid=4729080; zkhanmlgroupid=1; zkhanmlrnd=zkIOXHSZ8ya4GKjTuTxA; zkhanmlauth=0b23850ea3f69277fce68255bf7e776c; ', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36', 'x-requested-with': 'XMLHttpRequest'} else: self.headers = { 'cookie': cookie, 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36', 'x-requested-with': 'XMLHttpRequest'} def get_HDimg_url(self, id): ''' 获取超清图片的下载地址 :param id: :return: ''' api_url = 'https://pic.netbian.com/e/extend/downpic.php' params = {'id': id} res = requests.get(api_url, headers=self.headers, params=params) js = json.loads(res.text) return self.url + js['pic'] def download_img(self, url, path): ''' 下载文件 :param url:地址 :param path: 保存文件名 :return: ''' try: res = requests.get(url, headers=self.headers) with open(path, 'ab')as f: f.write(res.content) print(path, '下载成功！') except Exception as err: print(err, '下载失败！') def get_img_list(self, url='https://pic.netbian.com/new/', max_page=1, HD='1'): ''' 根据输入的目标类目url来爬取所属的图片，比如最新的的https://pic.netbian.com/new/ :param url:目标类目的url :param max_page:爬取多少页默认为1 :param HD:是否爬取超清的图片,注意了这个需要您有会员,不然一天只能爬1张..没啥意义默认为False :return:返回一个包含所有图片下载地址和name的列表 ''' img_list = [] #如果有人输入了这种带页码的网站则处理一下 if url.find('index_')!=-1: url=url.split('index_')[0] try: for page in range(max_page): print(f'正在获取{page+1}页的图片资料...') if page+1>=2: res = requests.get(url+f'index_{page+1}.html', headers=self.headers) else: res = requests.get(url, headers=self.headers) res.encoding = 'gbk' bson = bs4.BeautifulSoup(res.text, 'lxml') bson = bson.select('#main > div.slist > ul > li') for item in bson: d = {} d['href'] = self.url + item.a['href'] res = requests.get(d['href'], headers=self.headers) res.encoding = 'gbk' bs = bs4.BeautifulSoup(res.text, 'lxml') bs = bs.select_one('#img > img') img_id = d['href'].split('/')[-1].split('.')[0] if HD == '1': d['src'] = self.url+bs['src'] else: # 超清 d['src'] = self.get_HDimg_url(img_id) d['title'] = item.img['alt'] d['name'] = img_id + '_' + d['title'].replace(' ', '_') + '.' + \ d['src'].split('.')[-1] img_list.append(d) except Exception as err: print(err) return img_list def download_batch(self, img_list, dir_, tt=0.2): ''' 把整个列表的图都下载下来，因为有限制，所以没必要搞线程 :param img_list: 图片的列表数据 :param dir: 保存目录位置 :param tt: 每次下载等待时间默认为0.2s :return: ''' length=len(img_list) if length==0: print('您不是会员,或者已经被限制了!') return print(f'一共有{length}个下载任务...') try: os.mkdir(dir_) except: pass for item in img_list: path = dir_ + '\\' + item['name'] self.download_img(item['src'], path) time.sleep(tt)if __name__ == '__main__': print('本脚本目标网站为:https://pic.netbian.com/','仅供技术交流,请勿用户违法或者商业用途,否则后果自负!') print('建议:(30元年会可以自己登入后将cookie粘贴到下方，这样就能每天至少能爬取200张超清图10页，而1块钱7天的会员就每天20张,只能爬一页)') key = input('回车确认cookie身份如果需要自定义cookie可以直接输入留空也可以：\n') if len(key) > 5: bah = NetbiAn(key) else: bah = NetbiAn() HD = input('是否下载超清画质(如果有会员可以填2,不是会员就填1)：\n1.普通画质\n2.超清画质\n') type_url = input('输入目标类目的url,如果不填默认为:https://pic.netbian.com/new/\n') max_page=int(input('爬取多少页?\n')) if type_url=='': img_list = bah.get_img_list(max_page=max_page,HD=HD) else: img_list=bah.get_img_list(type_url,max_page,HD) bah.download_batch(img_list,'img') input('所有任务结束!')
下载地址:

游客，如果您要查看本帖隐藏内容请回复