- 注册时间
- 2017-4-7
- 最后登录
- 2017-6-23
- 回帖
- 2
- 主题
- 113
- 精华
- 2
- 金币
- 293
- 威望
- 0
- 股份
- 0
- 热心值
- 1
- 积分
- 307
宝藏初中生
- 回帖
- 2
- 金币
- 293
- 威望
- 0
- 积分
- 307
- 股份
- 0
- 热心值
- 1
- 宝藏币
- 0
|
小白第二次发帖,如有不当请及时删除!
这次我给大家带来的是 知乎热评爬虫
这是界面展示:
以下是爬取结果展示:
爬取的结果是excel,然后再评论里面有一些啥的,我没有删除,不是因为我懒得写代码,是因为里面还有图片链接,我不想动这个格式,如果你们有需要的话,完全可以自己再整理成文章。我这是保留了文本的原始性。
还有些注意事项都在软件上写了,有啥问题再问吧,实在是太困了。。
我以为我十一点就可以发出来,但是文件太大了,就又开始学虚拟环境打包。然后还有各种各样奇奇怪怪的问题。。。。
到现在4点了,狗命都快没了~~
源码:
[Python] 纯文本查看 复制代码# -*- coding:utf-8 -*-import requestsfrom lxml import etreeimport pandas as pdimport jsonfrom openpyxl import Workbookfrom openpyxl.utils.dataframe import dataframe_to_rowsimport datetimefrom PySide2.QtWidgets import QApplication, QMessageBox,QFileDialog,QHeaderView,QAbstractItemView,QTableWidgetItemfrom PySide2.QtUiTools import QUiLoaderimport threadingimport osfrom PySide2.QtGui import QIconclass Download: def __init__(self): self.ui = QUiLoader().load('知乎.ui') self.ui.cookies_button.clicked.connect(self.get_cookies) self.ui.catch_button.clicked.connect(self.find_hot) self.ui.save_button.clicked.connect(self.save2) self.cookies = 0000 self.df = 0000 self.ui.hot_list.horizontalHeader().resizeSection(0, 80) self.ui.hot_list.horizontalHeader().resizeSection(1, 400) # self.tableWidget.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch) self.ui.hot_list.setEditTriggers(QAbstractItemView.NoEditTriggers) self.hot_list = 0000 #self.ui.hot_list.setEditTriggers(QAbstractItemView.NoEditTriggers) def get_cookies(self): if len(threading.enumerate()) >= 2: QMessageBox.about(self.ui, '警告', '正在下载!!' ) return self.ui.cookies_edit.clear() self.ui.cookies_edit.paste() self.cookies = self.ui.cookies_edit.toPlainText() def find_hot(self): if len(threading.enumerate()) >= 2: QMessageBox.about(self.ui, '警告', '正在下载!!' 
) return if self.cookies == 0000: QMessageBox.about(self.ui, '警告', '你还没有粘贴进去cookies!下载个锤子' ) return rows = self.ui.hot_list.rowCount() for row in range(rows): self.ui.hot_list.removeRow(0) url = 'https://www.zhihu.com/hot' headers = { 'user-agent': '''Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36''', 'cookie': self.cookies } try: response = requests.get(url, headers=headers) except: self.ui.hot_list.insertRow(0) item = QTableWidgetItem('错误!') self.ui.hot_list.setItem(0, 0, item) item = QTableWidgetItem('cookies可能有误,无误的话就重启软件试试~') self.ui.hot_list.setItem(0, 1, item) return html = response.text html = etree.HTML(html) url_list = html.xpath('''//div[@class='HotList-list']/section/div[@class="HotItem-content"]/a/@href''') title_list = html.xpath('''//div[@class='HotList-list']/section/div[@class="HotItem-content"]/a/@title''') hot_nums = html.xpath('''//div[@class='HotList-list']/section/div[@class="HotItem-content"]/div/text()''') data = { 'url': url_list, 'title': title_list, 'hot': hot_nums } df = pd.DataFrame(data) try: df1 = pd.DataFrame(list(df['url'].str.split('/'))) except AttributeError: self.ui.hot_list.insertRow(0) item = QTableWidgetItem('错误!') self.ui.hot_list.setItem(0, 0, item) item = QTableWidgetItem('你可拉倒吧,cookies明显有误!') self.ui.hot_list.setItem(0, 1, item) return df = df.join(df1[[3, 4]]) df = df.rename(columns={3: 'isque', 4: 'pid'}) df = df[df['isque'] == 'question'] for i in range(len(df)): dftemp = df.loc row = self.ui.hot_list.rowCount() self.ui.hot_list.insertRow(row) item = QTableWidgetItem(str(dftemp['hot'])) self.ui.hot_list.setItem(row, 0, item) item = QTableWidgetItem(str(dftemp['title'])) self.ui.hot_list.setItem(row, 1, item) self.df = df def save2(self): if len(threading.enumerate()) >= 2: QMessageBox.about(self.ui, '警告', '正在下载!!' 
) return print(type(self.df)) try: if self.df == 0000: QMessageBox.about(self.ui, '警告', '你还没有获取,保存个锤子', ) return except ValueError: pass df = self.df filePath = QFileDialog.getExistingDirectory(self.ui, "选择存储路径") if filePath == '': return def get_comments(df,cookies): pid = df['pid'] comm_list = [] print(f'正在爬pid={pid}') for i in range(1, 100, 20): print(f'其中第{i}个') url = f'https://www.zhihu.com/api/v4/questions/{pid}/answers?include=data%5B%2A%5D.is_normal%2CADmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics&limit=20&offset={i}&platform=desktop&sort_by=default' headers = { 'user-agent': '''Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36''', 'cookie': cookies } res = requests.get(url, headers=headers) res.encoding = 'utf-8' html = res.text data = json.loads(html)['data'] if data == []: break df2 = pd.DataFrame(data) df3 = pd.DataFrame(list(df2['author'])) df3 = df3[['name', 'url_token', 'user_type', 'headline']] df4 = df3.merge(df2[['content', 'voteup_count', 'comment_count']], left_index=True, right_index=True) comm_list.append(df4) a = comm_list[0] for j in comm_list[1:]: a = a.append(j, ignore_index=True) return a def thr(df,ui,cookies): hot_list = [] for i in df.index: ui.download_edit.append(f'正在爬第{i + 1}个') ui.download_edit.ensureCursorVisible() comments = get_comments(df.loc,cookies) hot_list.append(comments) #break ui.download_edit.append('终于下载完了,正在保存') 
ui.download_edit.ensureCursorVisible() wb = Workbook() ws = wb.active for row in dataframe_to_rows(df[['url', 'title', 'hot']], index=False, header=True): ws.append(row) count = 1 for df0 in hot_list: title = f'排行第{count}' wb.create_sheet(title=title) sheet = wb[title] for r in dataframe_to_rows(df0, index=False, header=True): sheet.append(r) count += 1 file_time = str(datetime.datetime.now())[:-7] file_time = file_time.replace(":", '-') file_name = f"知乎热评{file_time}.xlsx" wb.save(filePath + '\\' + file_name) # QMessageBox.about(ui, # '通知', # '久等了,终于下载完成了' # ) t1 = threading.Thread(target=thr,args=[df,self.ui,self.cookies]) t1.start()app = QApplication([])app.setWindowIcon(QIcon('0.png'))d = Download()d.ui.show()app.exec_()os._exit(0)
说实话我不太喜欢公布自己的源码,虽然我菜,我也不想被抄。而且说实话写这点代码也费了不少脑子和好多精力。(主要是灵感一来就想爆肝)
不过嘛,不放源码怕你们不放心。。
爬虫是单线程下载,所以没那么快。多线程太麻烦了,还要排序啥的。。。不过也不算慢,五分钟之内吧,也没计时。
其中找自己cookies应该大家都会吧?F12找嘛。
这软件68M,脚本10K。我也是醉了,一开始280M,到68M我已经是没有办法再压缩了。。。请见谅哈~
希望大家给个回复支持啦,谢谢啦。。。
刚才忘记放文件了:
|
|