python爬取小说

shao84314 · 发表于 2020-3-21 17:45:22

环境：Python 3.6 + Windows
开发工具：你喜欢用哪个就用哪个，你开心就好！
主要思路：
1. 获取主页源代码
2. 获取章节超链接
3. 获取章节超链接源码
4. 获取小说内容
5. 下载，文件操作
源码：

import urllib.request
import re

# 1 获取主页源代码
# 2 获取章节超链接
# 3 获取章节超链接源码
# 4 获取小说内容
# 5 下载,文件操作
# Naming note carried over from the post: the function name is camelCase.
#
# NOTE(review): the forum renderer stripped the HTML out of the string
# literals in this snippet (they survived only as "[url=](.*?)[/url]",
# " (.*?)", " " and a bare line break). The two patterns and the two
# replace() arguments below are reconstructed from the example outputs quoted
# in the post's own comments (a list of (url, title) tuples; body text that
# starts with "&nbsp"). Verify them against the live page source before use.

# Captures (chapter URL, chapter title) for every entry of the book index.
_CHAPTER_LINK_RE = re.compile(
    r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>'
)
# re.S lets "." match line breaks, because a chapter body spans many lines.
_CHAPTER_BODY_RE = re.compile(
    r'</script>&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<script type="text/javascript">',
    re.S,
)
# Characters that Windows refuses in file names (plus stray whitespace
# control characters that could be captured together with a title).
_UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _fetch_html(url):
    """Download *url* and return the response body decoded as GBK."""
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("gbk")


def getNovertContent():
    """Download every chapter of the novel and save each one as a text file.

    Fetches the book index page, extracts the (URL, title) pair of each
    chapter link, downloads every chapter page, strips the HTML padding from
    the chapter body and writes it to ``<title>.txt`` in the current working
    directory. Files are written as UTF-8 so the result does not depend on
    the platform's default encoding.

    A chapter whose page does not match the body pattern is reported and
    skipped instead of aborting the whole run.

    Raises:
        urllib.error.URLError: if the index or a chapter page cannot be
            fetched.
        UnicodeDecodeError: if a page is not valid GBK.
    """
    index_html = _fetch_html("http://www.quanshuwang.com/book/0/269")

    # Each match is a tuple such as
    # ("http://www.quanshuwang.com/book/0/269/78850.html", "第一章山边小村").
    for novel_url, novel_title in _CHAPTER_LINK_RE.findall(index_html):
        chapt_html = _fetch_html(novel_url)

        body_match = _CHAPTER_BODY_RE.search(chapt_html)
        if body_match is None:
            # The original indexed [0] unconditionally and died with
            # IndexError on the first page with a different layout.
            print("未找到正文，跳过 %s" % novel_title)
            continue

        # Drop the paragraph indentation entities and the line-break tags.
        chapt_content = body_match.group(1)
        chapt_content = chapt_content.replace("&nbsp;&nbsp;&nbsp;&nbsp;", "")
        chapt_content = chapt_content.replace("<br />", "")

        print("正在保存 %s" % novel_title)

        # Titles come from the web page and may contain characters that are
        # not legal in a file name; replace them rather than fail in open().
        file_name = _UNSAFE_FILENAME_CHARS_RE.sub("_", novel_title).strip()
        with open("{}.txt".format(file_name), "w", encoding="utf-8") as f:
            f.write(chapt_content)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    getNovertContent()

[视频教程] python爬取小说

个人中心