python爬虫实战-利用xpath抓取小说
本代码仅限于参考学习,新手一枚,技术仅限于此
开发环境
windows
python3.7
requests (2.19.1)
lxml (4.2.5)
redis (3.0.1)
OK,直入主题
页面解析
def get_html(url):
    """Fetch the chapter-index page at *url* and return its HTML as text.

    The target site serves GBK-encoded pages while requests guesses
    ISO-8859-1 from the response headers. Setting ``resp.encoding``
    decodes correctly in one step — unlike the old
    ``text.encode('iso-8859-1').decode('gbk')`` round-trip, it does not
    raise on stray bytes with no GBK mapping.

    Shows a warning dialog and implicitly returns None on any failure.
    """
    try:
        header = {
            'Referer': url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400'
        }
        # timeout so a dead server can't hang the GUI forever
        resp = requests.get(url, headers=header, timeout=30)
        resp.encoding = 'gbk'  # site is GBK; requests would default to iso-8859-1
        return resp.text
    except Exception as e:
        tkinter.messagebox.showwarning("页面解析异常", e)
        mainloop()
获取页面信息
def get_html_info(html):
    """Parse the chapter-index HTML and extract the book's metadata.

    Returns a dict with keys:
        article_title: book title (str)
        author:        author name (str)
        chapter_text:  list of {'chapter_titile': ..., 'text_url': ...}
                       ('chapter_titile' spelling kept for compatibility
                       with get_text, which reads that key)

    Shows a warning dialog and implicitly returns None on parse failure.
    """
    try:
        html_info = etree.HTML(html)
        article_title = html_info.xpath('//a[contains(@class,"article_title")]/text()')[0]  # book title
        author = html_info.xpath('//span[contains(@class,"r")]/text()')[0]  # author
        text_url = html_info.xpath('//div[contains(@class,"clearfix")]/li/a/@href')  # chapter URLs
        chapter_titile = html_info.xpath('//div[contains(@class,"clearfix")]/li/a/text()')  # chapter titles
        # zip pairs titles with URLs safely even if the two node lists
        # differ in length (the old range(len(...)) indexing raised IndexError)
        chapter_list = [
            {'chapter_titile': title, 'text_url': url}
            for title, url in zip(chapter_titile, text_url)
        ]
        return {
            'article_title': article_title,
            'author': author,
            'chapter_text': chapter_list,
        }
    except Exception as e:
        tkinter.messagebox.showwarning('获取页面信息异常', e)
        mainloop()
获取小说正文并写入本地
def get_text(html):
    """Download every chapter listed in the index page *html* and write
    them to a local '<title>_<author>.txt' file (UTF-8).

    Shows an info box when the download completes, or a warning dialog
    on any failure.
    """
    try:
        result_info_dict = get_html_info(html)
        article_title = result_info_dict['article_title']
        author = result_info_dict['author']
        chapter_text_list = result_info_dict['chapter_text']
        # UA is constant; only the Referer changes per request
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400')
        text_list = []
        for chapter_text in chapter_text_list:
            chapter_titile = chapter_text['chapter_titile']
            text_url = chapter_text['text_url']
            header = {'Referer': text_url, 'User-Agent': user_agent}
            resp = requests.get(text_url, headers=header, timeout=30)
            resp.encoding = 'gbk'  # site is GBK; requests would default to iso-8859-1
            text_info = etree.HTML(resp.text)
            # lxml elements must be compared with 'is not None';
            # '!= None' is deprecated and emits a FutureWarning
            if text_info is not None:
                paragraphs = text_info.xpath('//div[contains(@id,"content")]/text()')
                # join once instead of quadratic '+=' concatenation
                context = ''.join(paragraphs).replace('\r', '').replace('\n', '').replace('\xa0', '')
                # redis_conn.hset(article_title,chapter_titile,context)  # optional: store in redis
                print('正在下载%s' % chapter_titile)
                text_list.append({'chapter_titile': chapter_titile, 'context': context})
                time.sleep(1)  # be polite to the server
        print('下载完毕,正在写入本地文件')
        # 'with' guarantees the file is flushed and closed; the old code
        # called write.close() AFTER mainloop(), which blocks, so the
        # handle was effectively never closed
        with open(article_title + '_' + author + '.txt', 'wb') as write:
            for text in text_list:
                chapter_titile = text['chapter_titile']
                context = text['context']
                write.write((chapter_titile + " " + context).encode('utf-8'))
                print('成功写入%s' % chapter_titile)
        tkinter.messagebox.showinfo('恭喜你!', '%s下载完成' % article_title)
        mainloop()
    except Exception as e:
        tkinter.messagebox.showwarning('下载或写入异常', e)
        mainloop()
完整代码
import time
import tkinter.messagebox
from tkinter import *
import redis
import requests
from lxml import etree
def get_html(url):
    """Fetch the chapter-index page at *url* and return its HTML as text.

    The target site serves GBK-encoded pages while requests guesses
    ISO-8859-1 from the response headers. Setting ``resp.encoding``
    decodes correctly in one step — unlike the old
    ``text.encode('iso-8859-1').decode('gbk')`` round-trip, it does not
    raise on stray bytes with no GBK mapping.

    Shows a warning dialog and implicitly returns None on any failure.
    """
    try:
        header = {
            'Referer': url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400'
        }
        # timeout so a dead server can't hang the GUI forever
        resp = requests.get(url, headers=header, timeout=30)
        resp.encoding = 'gbk'  # site is GBK; requests would default to iso-8859-1
        return resp.text
    except Exception as e:
        tkinter.messagebox.showwarning("页面解析异常", e)
        mainloop()
def get_html_info(html):
    """Parse the chapter-index HTML and extract the book's metadata.

    Returns a dict with keys:
        article_title: book title (str)
        author:        author name (str)
        chapter_text:  list of {'chapter_titile': ..., 'text_url': ...}
                       ('chapter_titile' spelling kept for compatibility
                       with get_text, which reads that key)

    Shows a warning dialog and implicitly returns None on parse failure.
    """
    try:
        html_info = etree.HTML(html)
        article_title = html_info.xpath('//a[contains(@class,"article_title")]/text()')[0]  # book title
        author = html_info.xpath('//span[contains(@class,"r")]/text()')[0]  # author
        text_url = html_info.xpath('//div[contains(@class,"clearfix")]/li/a/@href')  # chapter URLs
        chapter_titile = html_info.xpath('//div[contains(@class,"clearfix")]/li/a/text()')  # chapter titles
        # zip pairs titles with URLs safely even if the two node lists
        # differ in length (the old range(len(...)) indexing raised IndexError)
        chapter_list = [
            {'chapter_titile': title, 'text_url': url}
            for title, url in zip(chapter_titile, text_url)
        ]
        return {
            'article_title': article_title,
            'author': author,
            'chapter_text': chapter_list,
        }
    except Exception as e:
        tkinter.messagebox.showwarning('获取页面信息异常', e)
        mainloop()
def get_text(html):
    """Download every chapter listed in the index page *html* and write
    them to a local '<title>_<author>.txt' file (UTF-8).

    Shows an info box when the download completes, or a warning dialog
    on any failure.
    """
    try:
        result_info_dict = get_html_info(html)
        article_title = result_info_dict['article_title']
        author = result_info_dict['author']
        chapter_text_list = result_info_dict['chapter_text']
        # UA is constant; only the Referer changes per request
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400')
        text_list = []
        for chapter_text in chapter_text_list:
            chapter_titile = chapter_text['chapter_titile']
            text_url = chapter_text['text_url']
            header = {'Referer': text_url, 'User-Agent': user_agent}
            resp = requests.get(text_url, headers=header, timeout=30)
            resp.encoding = 'gbk'  # site is GBK; requests would default to iso-8859-1
            text_info = etree.HTML(resp.text)
            # lxml elements must be compared with 'is not None';
            # '!= None' is deprecated and emits a FutureWarning
            if text_info is not None:
                paragraphs = text_info.xpath('//div[contains(@id,"content")]/text()')
                # join once instead of quadratic '+=' concatenation
                context = ''.join(paragraphs).replace('\r', '').replace('\n', '').replace('\xa0', '')
                # redis_conn.hset(article_title,chapter_titile,context)  # optional: store in redis
                print('正在下载%s' % chapter_titile)
                text_list.append({'chapter_titile': chapter_titile, 'context': context})
                time.sleep(1)  # be polite to the server
        print('下载完毕,正在写入本地文件')
        # 'with' guarantees the file is flushed and closed; the old code
        # called write.close() AFTER mainloop(), which blocks, so the
        # handle was effectively never closed
        with open(article_title + '_' + author + '.txt', 'wb') as write:
            for text in text_list:
                chapter_titile = text['chapter_titile']
                context = text['context']
                write.write((chapter_titile + " " + context).encode('utf-8'))
                print('成功写入%s' % chapter_titile)
        tkinter.messagebox.showinfo('恭喜你!', '%s下载完成' % article_title)
        mainloop()
    except Exception as e:
        tkinter.messagebox.showwarning('下载或写入异常', e)
        mainloop()
if __name__ == '__main__':
    # redis_conn = redis.Redis('127.0.0.1',6379,db=1)
    # Only works for quanshuwang.com chapter-index pages,
    # e.g. http://www.quanshuwang.com/book/12/12645
    url = input('请输入全书网章节目录的URL:')
    # url = 'http://www.quanshuwang.com/book/10/10732'
    page = get_html(url=url)
    get_text(page)
代码中使用到了python的消息提示框,运行时会有一些提示框出现