python爬虫实战-利用xpath抓取小说
本代码仅限于参考学习,新手一枚,技术仅限于此
开发环境
windows
python3.7
requests (2.19.1)
lxml (4.2.5)
redis (3.0.1)
OK,直入主题
页面解析
def get_html(url):
    """Fetch the chapter-index page at *url* and return its HTML as text.

    The target site serves GBK-encoded pages while requests guesses
    ISO-8859-1 from the response headers. Setting ``resp.encoding``
    decodes correctly in one step — unlike the old
    ``text.encode('iso-8859-1').decode('gbk')`` round-trip, it does not
    raise on stray bytes with no GBK mapping.

    Shows a warning dialog and implicitly returns None on any failure.
    """
    try:
        header = {
            'Referer': url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400'
        }
        # timeout so a dead server can't hang the GUI forever
        resp = requests.get(url, headers=header, timeout=30)
        resp.encoding = 'gbk'  # site is GBK; requests would default to iso-8859-1
        return resp.text
    except Exception as e:
        tkinter.messagebox.showwarning("页面解析异常", e)
        mainloop()
获取页面信息
def get_html_info(html):
    """Parse the chapter-index HTML and extract the book's metadata.

    Returns a dict with keys:
        article_title: book title (str)
        author:        author name (str)
        chapter_text:  list of {'chapter_titile': ..., 'text_url': ...}
                       ('chapter_titile' spelling kept for compatibility
                       with get_text, which reads that key)

    Shows a warning dialog and implicitly returns None on parse failure.
    """
    try:
        html_info = etree.HTML(html)
        article_title = html_info.xpath('//a[contains(@class,"article_title")]/text()')[0]  # book title
        author = html_info.xpath('//span[contains(@class,"r")]/text()')[0]  # author
        text_url = html_info.xpath('//div[contains(@class,"clearfix")]/li/a/@href')  # chapter URLs
        chapter_titile = html_info.xpath('//div[contains(@class,"clearfix")]/li/a/text()')  # chapter titles
        # zip pairs titles with URLs safely even if the two node lists
        # differ in length (the old range(len(...)) indexing raised IndexError)
        chapter_list = [
            {'chapter_titile': title, 'text_url': url}
            for title, url in zip(chapter_titile, text_url)
        ]
        return {
            'article_title': article_title,
            'author': author,
            'chapter_text': chapter_list,
        }
    except Exception as e:
        tkinter.messagebox.showwarning('获取页面信息异常', e)
        mainloop()
获取小说正文并写入本地
def get_text(html):
    """Download every chapter listed in the index page *html* and write
    them to a local '<title>_<author>.txt' file (UTF-8).

    Shows an info box when the download completes, or a warning dialog
    on any failure.
    """
    try:
        result_info_dict = get_html_info(html)
        article_title = result_info_dict['article_title']
        author = result_info_dict['author']
        chapter_text_list = result_info_dict['chapter_text']
        # UA is constant; only the Referer changes per request
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400')
        text_list = []
        for chapter_text in chapter_text_list:
            chapter_titile = chapter_text['chapter_titile']
            text_url = chapter_text['text_url']
            header = {'Referer': text_url, 'User-Agent': user_agent}
            resp = requests.get(text_url, headers=header, timeout=30)
            resp.encoding = 'gbk'  # site is GBK; requests would default to iso-8859-1
            text_info = etree.HTML(resp.text)
            # lxml elements must be compared with 'is not None';
            # '!= None' is deprecated and emits a FutureWarning
            if text_info is not None:
                paragraphs = text_info.xpath('//div[contains(@id,"content")]/text()')
                # join once instead of quadratic '+=' concatenation
                context = ''.join(paragraphs).replace('\r', '').replace('\n', '').replace('\xa0', '')
                # redis_conn.hset(article_title,chapter_titile,context)  # optional: store in redis
                print('正在下载%s' % chapter_titile)
                text_list.append({'chapter_titile': chapter_titile, 'context': context})
                time.sleep(1)  # be polite to the server
        print('下载完毕,正在写入本地文件')
        # 'with' guarantees the file is flushed and closed; the old code
        # called write.close() AFTER mainloop(), which blocks, so the
        # handle was effectively never closed
        with open(article_title + '_' + author + '.txt', 'wb') as write:
            for text in text_list:
                chapter_titile = text['chapter_titile']
                context = text['context']
                write.write((chapter_titile + " " + context).encode('utf-8'))
                print('成功写入%s' % chapter_titile)
        tkinter.messagebox.showinfo('恭喜你!', '%s下载完成' % article_title)
        mainloop()
    except Exception as e:
        tkinter.messagebox.showwarning('下载或写入异常', e)
        mainloop()
完整代码
import time
import tkinter.messagebox
from tkinter import *
import redis
import requests
from lxml import etree
def get_html(url):
    """Fetch the chapter-index page at *url* and return its HTML as text.

    The target site serves GBK-encoded pages while requests guesses
    ISO-8859-1 from the response headers. Setting ``resp.encoding``
    decodes correctly in one step — unlike the old
    ``text.encode('iso-8859-1').decode('gbk')`` round-trip, it does not
    raise on stray bytes with no GBK mapping.

    Shows a warning dialog and implicitly returns None on any failure.
    """
    try:
        header = {
            'Referer': url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400'
        }
        # timeout so a dead server can't hang the GUI forever
        resp = requests.get(url, headers=header, timeout=30)
        resp.encoding = 'gbk'  # site is GBK; requests would default to iso-8859-1
        return resp.text
    except Exception as e:
        tkinter.messagebox.showwarning("页面解析异常", e)
        mainloop()
def get_html_info(html):
    """Parse the chapter-index HTML and extract the book's metadata.

    Returns a dict with keys:
        article_title: book title (str)
        author:        author name (str)
        chapter_text:  list of {'chapter_titile': ..., 'text_url': ...}
                       ('chapter_titile' spelling kept for compatibility
                       with get_text, which reads that key)

    Shows a warning dialog and implicitly returns None on parse failure.
    """
    try:
        html_info = etree.HTML(html)
        article_title = html_info.xpath('//a[contains(@class,"article_title")]/text()')[0]  # book title
        author = html_info.xpath('//span[contains(@class,"r")]/text()')[0]  # author
        text_url = html_info.xpath('//div[contains(@class,"clearfix")]/li/a/@href')  # chapter URLs
        chapter_titile = html_info.xpath('//div[contains(@class,"clearfix")]/li/a/text()')  # chapter titles
        # zip pairs titles with URLs safely even if the two node lists
        # differ in length (the old range(len(...)) indexing raised IndexError)
        chapter_list = [
            {'chapter_titile': title, 'text_url': url}
            for title, url in zip(chapter_titile, text_url)
        ]
        return {
            'article_title': article_title,
            'author': author,
            'chapter_text': chapter_list,
        }
    except Exception as e:
        tkinter.messagebox.showwarning('获取页面信息异常', e)
        mainloop()
def get_text(html):
    """Download every chapter listed in the index page *html* and write
    them to a local '<title>_<author>.txt' file (UTF-8).

    Shows an info box when the download completes, or a warning dialog
    on any failure.
    """
    try:
        result_info_dict = get_html_info(html)
        article_title = result_info_dict['article_title']
        author = result_info_dict['author']
        chapter_text_list = result_info_dict['chapter_text']
        # UA is constant; only the Referer changes per request
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400')
        text_list = []
        for chapter_text in chapter_text_list:
            chapter_titile = chapter_text['chapter_titile']
            text_url = chapter_text['text_url']
            header = {'Referer': text_url, 'User-Agent': user_agent}
            resp = requests.get(text_url, headers=header, timeout=30)
            resp.encoding = 'gbk'  # site is GBK; requests would default to iso-8859-1
            text_info = etree.HTML(resp.text)
            # lxml elements must be compared with 'is not None';
            # '!= None' is deprecated and emits a FutureWarning
            if text_info is not None:
                paragraphs = text_info.xpath('//div[contains(@id,"content")]/text()')
                # join once instead of quadratic '+=' concatenation
                context = ''.join(paragraphs).replace('\r', '').replace('\n', '').replace('\xa0', '')
                # redis_conn.hset(article_title,chapter_titile,context)  # optional: store in redis
                print('正在下载%s' % chapter_titile)
                text_list.append({'chapter_titile': chapter_titile, 'context': context})
                time.sleep(1)  # be polite to the server
        print('下载完毕,正在写入本地文件')
        # 'with' guarantees the file is flushed and closed; the old code
        # called write.close() AFTER mainloop(), which blocks, so the
        # handle was effectively never closed
        with open(article_title + '_' + author + '.txt', 'wb') as write:
            for text in text_list:
                chapter_titile = text['chapter_titile']
                context = text['context']
                write.write((chapter_titile + " " + context).encode('utf-8'))
                print('成功写入%s' % chapter_titile)
        tkinter.messagebox.showinfo('恭喜你!', '%s下载完成' % article_title)
        mainloop()
    except Exception as e:
        tkinter.messagebox.showwarning('下载或写入异常', e)
        mainloop()
if __name__ == '__main__':
    # redis_conn = redis.Redis('127.0.0.1',6379,db=1)
    # Only works for quanshuwang.com chapter-index pages,
    # e.g. http://www.quanshuwang.com/book/12/12645
    url = input('请输入全书网章节目录的URL:')
    # url = 'http://www.quanshuwang.com/book/10/10732'
    page = get_html(url=url)
    get_text(page)
代码中使用到了python的消息提示框,运行时会有一些提示框出现