Preface

The approach is straightforward: requests.get() each chapter page, parse it, and pull out the chapter title and body.
Maybe my network is flaky, or maybe the site's server isn't great, but requests sometimes time out. Even with the timeout set to 100 seconds some requests still failed, so a few chapters didn't come down in the first pass. I had to fetch those chapters one by one afterwards and splice them back in, which was quite tedious.

Code

#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Time:2018/10/3 22:00
# @Author: wardseptember
# @File: jinRong.py

import requests
from bs4 import BeautifulSoup
import time

# Fetch the title and content of one chapter page
def get_chapter(url):
    # Download the page source
    try:
        html = requests.get(url, timeout=100)
        html.encoding = html.apparent_encoding
        content = html.content
        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup(content, "lxml")
        title = soup.find('h1').text    # chapter title
        text = soup.find('div', id='htmlContent')    # chapter body
        # Tidy up the chapter text so paragraphs are indented consistently
        content = text.get_text('\n', strip=True).replace('\n', '\n    ')
        content = content.replace('  ', '\n  ')
        return title, '    ' + content
    except Exception as e:
        print('Request failed or page could not be parsed:', e)
        raise    # let the caller's try/except skip this chapter


def main():
    # list of books to download
    books = ['射雕英雄传','天龙八部','鹿鼎记','神雕侠侣','笑傲江湖','碧血剑','倚天屠龙记',\
             '飞狐外传','书剑恩仇录','连城诀','侠客行','越女剑','鸳鸯刀','白马啸西风',\
             '雪山飞狐']
    order = [1,2,3,4,5,6,7,8,10,11,12,14,15,13,9]  # each book's id in the site's urls
    # chapters of books[i] occupy pages page_range[i] .. page_range[i+1]-1
    page_range = [1,43,94,145,185,225,248,289,309,329,341,362,363,364,374,385]

    for i, book in enumerate(books):
        # create (or empty) the output file for this book
        with open('D://jinyong//%s.txt' % book, 'w', encoding='gb18030'):
            pass
        for num in range(page_range[i], page_range[i+1]):
            url = "http://jinyong.zuopinj.com/%s/%s.html" % (order[i], num)
            # error handling: a chapter that still fails after the timeout is skipped
            try:
                title, chapter = get_chapter(url)
                time.sleep(2)    # pause between requests to go easy on the server
                with open('D://jinyong//%s.txt' % book, 'a', encoding='gb18030') as f:
                    print(book + ':' + title + ' --> written')
                    f.write(title + '\n\n\n')
                    f.write(chapter + '\n\n\n')
            except Exception as e:
                print(str(e))
    print('All books written.')

if __name__ == '__main__':
    main()
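
To cut down on the manual re-scraping of failed chapters described at the top, the request could be retried a few times before giving up. The sketch below is my own addition, not part of the original script; the name retry_get and the retries/delay parameters are assumptions. get_chapter() could then call retry_get(url) in place of requests.get(url, timeout=100).

import time
import requests

def retry_get(url, retries=3, timeout=30, delay=5):
    # Try the GET request several times before giving up (illustrative sketch)
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.raise_for_status()    # treat HTTP error codes as failures too
            return resp
        except requests.RequestException as e:
            print('attempt %d failed for %s: %s' % (attempt, url, e))
            if attempt < retries:
                time.sleep(delay)    # wait a bit before retrying
    raise RuntimeError('all %d attempts failed for %s' % (retries, url))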

Code for fetching a single chapter; just change the url.

#!/usr/bin/python3
# -*- coding:utf-8 -*-
# @Time:2018/10/3 22:00
# @Author: wardseptember
# @File: jinRong.py


import requests
from bs4 import BeautifulSoup
import time

# Fetch the title and content of one chapter page
def get_chapter(url):
    # Download the page source
    try:
        html = requests.get(url, timeout=30)
        html.encoding = html.apparent_encoding
        content = html.content
        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup(content, "lxml")
        title = soup.find('h1').text    # chapter title
        text = soup.find('div', id='htmlContent')    # chapter body
        # Tidy up the chapter text so paragraphs are indented consistently
        content = text.get_text('\n', strip=True).replace('\n', '\n    ')
        content = content.replace('  ', '\n  ')
        return title, '    ' + content
    except Exception as e:
        print('Request failed or page could not be parsed:', e)
        raise    # let the caller's try/except handle the failure


def main():
    # change this url to fetch whichever chapter is missing
    url = "http://jinyong.zuopinj.com/9/374.html"
    # error handling
    try:
        title, chapter = get_chapter(url)
        time.sleep(2)
        with open('D://jinyong//补充缺失章.txt', 'w', encoding='gb18030') as f:
            print(title + ' --> written')
            f.write(title + '\n\n\n')
            f.write(chapter + '\n\n\n')
    except Exception as e:
        print(str(e))
    print('Done.')

if __name__ == '__main__':
    main()
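
For reference, the url pattern is http://jinyong.zuopinj.com/<book id>/<page number>.html, where the book id comes from the order list and the page number from the page_range interval in the full script above; judging from those lists, 9/374.html should be the first chapter of 雪山飞狐. The output file 补充缺失章.txt is just a scratch file, so the rescued chapter still needs to be pasted back into the right book file by hand, as described at the top.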

Download the complete scraped Jin Yong collection directly from Baidu Netdisk

Link: https://pan.baidu.com/s/1DwRkfoWCyTpS5NZqgkuiyg
Extraction code: zdym
