Scraping ultra-HD wallpapers: a Python crawler learning project, working build and source code
Features:
1. Batch-scrape HD images
2. Supports a custom cookie for scraping ultra-HD images (a paid membership is required; without one only one image per day downloads successfully) — see the sketch after the demo-video link
Demo video:
https://www.bilibili.com/video/BV1Cb4y1Q73j/
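A quick note on feature 2: the "custom cookie" is simply the Cookie request header copied from a browser session in which you are logged in to pic.netbian.com. A minimal, self-contained sketch (the variable name and cookie value below are placeholders, not part of the original script) of what that boils down to:

import requests

# Placeholder: replace with the Cookie header copied from your own logged-in
# browser session on pic.netbian.com (developer tools, Network tab).
my_cookie = 'PHPSESSID=xxxx; zkhanmlauth=xxxx'
headers = {
    'cookie': my_cookie,
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
res = requests.get('https://pic.netbian.com/', headers=headers)
print(res.status_code)  # should print 200 if the request went through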
Code:
import requests, bs4, time, json, os


class NetbiAn():
    def __init__(self, cookie=None):
        '''
        Crawler for pic.netbian.com (the "Bi'an" wallpaper gallery)
        :param cookie: custom cookie string; if omitted, the author's default cookie is used
        '''
        self.url = 'https://pic.netbian.com/'
        if cookie is None:
            # fall back to the author's own cookie
            cookie = '__yjs_duid=1_5497b819a72afc9101dd25f2d5726a8e1616818734114; __guid=216607383.3773875649706524700.1616818734566.507; Hm_lvt_526caf4e20c21f06a4e9209712d6a20e=1616818735; zkhanecookieclassrecord=%2C54%2C66%2C; PHPSESSID=varai3ubq9gf8ri9vpb7ppsrm5; zkhanmlusername=%B7%B2%CA%C2%BF%B4%C8%BA%CE%C4%BC%FE; zkhanmluserid=4729080; zkhanmlgroupid=1; zkhanmlrnd=zkIOXHSZ8ya4GKjTuTxA; zkhanmlauth=0b23850ea3f69277fce68255bf7e776c; '
        self.headers = {
            'cookie': cookie,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest'}

    def get_HDimg_url(self, id):
        '''
        Get the download URL of the ultra-HD version of an image
        :param id: image id taken from the detail-page URL
        :return: full download URL
        '''
        api_url = 'https://pic.netbian.com/e/extend/downpic.php'
        params = {'id': id}
        res = requests.get(api_url, headers=self.headers, params=params)

        js = json.loads(res.text)
        return self.url + js['pic'].lstrip('/')

    def download_img(self, url, path):
        '''
        Download a single file
        :param url: download URL
        :param path: path of the file to save
        :return:
        '''
        try:
            res = requests.get(url, headers=self.headers)
            # 'wb' instead of 'ab' so a re-run overwrites instead of appending to an existing file
            with open(path, 'wb') as f:
                f.write(res.content)
            print(path, 'downloaded successfully!')
        except Exception as err:
            print(err, 'download failed!')

    def get_img_list(self, url='https://pic.netbian.com/new/', max_page=1, HD='1'):
        '''
        Scrape the images of a given category, e.g. the "latest" category https://pic.netbian.com/new/
        :param url: category URL
        :param max_page: number of pages to scrape, default 1
        :param HD: '2' to fetch the ultra-HD originals (requires a membership, otherwise only one
                   image per day succeeds); default '1' (normal quality)
        :return: a list of dicts, one per image, each holding the download URL and file name
        '''
        img_list = []
        # if the user passed a URL that already contains a page number, strip it off
        if url.find('index_') != -1:
            url = url.split('index_')[0]

        try:
            for page in range(max_page):
                print(f'Fetching image metadata of page {page + 1}...')
                if page + 1 >= 2:
                    res = requests.get(url + f'index_{page + 1}.html', headers=self.headers)
                else:
                    res = requests.get(url, headers=self.headers)
                res.encoding = 'gbk'
                bson = bs4.BeautifulSoup(res.text, 'lxml')
                bson = bson.select('#main > div.slist > ul > li')

                for item in bson:
                    d = {}
                    d['href'] = self.url + item.a['href'].lstrip('/')
                    res = requests.get(d['href'], headers=self.headers)
                    res.encoding = 'gbk'
                    bs = bs4.BeautifulSoup(res.text, 'lxml')
                    bs = bs.select_one('#img > img')
                    img_id = d['href'].split('/')[-1].split('.')[0]
                    if HD == '1':
                        # normal quality: the preview image embedded in the detail page
                        d['src'] = self.url + bs['src'].lstrip('/')
                    else:
                        # ultra-HD: ask the download API for the original
                        d['src'] = self.get_HDimg_url(img_id)
                    d['title'] = item.img['alt']
                    d['name'] = img_id + '_' + d['title'].replace(' ', '_') + '.' + \
                                d['src'].split('.')[-1]
                    img_list.append(d)
        except Exception as err:
            print(err)
        return img_list

    def download_batch(self, img_list, dir_, tt=0.2):
        '''
        Download every image in the list; the site rate-limits downloads, so threads are pointless
        :param img_list: list of image dicts returned by get_img_list
        :param dir_: directory to save into
        :param tt: delay between downloads, default 0.2 s
        :return:
        '''
        length = len(img_list)
        if length == 0:
            print('You are not a member, or you have already hit the daily limit!')
            return
        print(f'{length} download task(s) in total...')
        os.makedirs(dir_, exist_ok=True)
        for item in img_list:
            path = os.path.join(dir_, item['name'])
            self.download_img(item['src'], path)
            time.sleep(tt)


if __name__ == '__main__':
    print('Target site: https://pic.netbian.com/', 'For technical exchange only; do not use it for illegal or commercial purposes, or you bear the consequences!')
    print('Tip: with the 30-yuan yearly membership you can log in, paste your cookie below and scrape at least 200 ultra-HD images (10 pages) per day; the 1-yuan 7-day membership only allows 20 per day, i.e. a single page.')
    key = input('Press Enter to use the built-in cookie, or paste your own cookie here (leaving it empty is fine):\n')
    if len(key) > 5:
        bah = NetbiAn(key)
    else:
        bah = NetbiAn()
    HD = input('Download ultra-HD quality? (choose 2 only if you have a membership, otherwise choose 1):\n1. normal quality\n2. ultra-HD quality\n')
    type_url = input('Enter a category URL; leave it empty to use the default https://pic.netbian.com/new/\n')
    max_page = int(input('How many pages to scrape?\n'))
    if type_url == '':
        img_list = bah.get_img_list(max_page=max_page, HD=HD)
    else:
        img_list = bah.get_img_list(type_url, max_page, HD)

    bah.download_batch(img_list, 'img')
    input('All tasks finished!')
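
For anyone who would rather skip the interactive prompts, here is a short, non-interactive usage sketch of the NetbiAn class above (the argument values are just examples; pass your own cookie string to NetbiAn() if you want the ultra-HD path to work):

# Non-interactive usage sketch of the NetbiAn class defined above.
crawler = NetbiAn()                      # or NetbiAn('your cookie string') for member-only ultra-HD downloads
imgs = crawler.get_img_list('https://pic.netbian.com/new/', max_page=2, HD='1')  # '2' = ultra-HD
crawler.download_batch(imgs, 'img', tt=0.5)  # a slightly longer pause than the 0.2 s default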