【python】爬虫练习1——小说
【代码】【python】爬虫练习1——小说。
·
import requests  # install first with: pip install requests
import re
from bs4 import BeautifulSoup #需要pip install bs4 或者 pip3 install bs4先下载
# Download novel chapters one by one and append their text to a local file.
# Example target URL (placeholder host): https://www.XXX&chapterid=1

# Copy both values from the browser: log in to the site, right-click and
# choose "Inspect", refresh the page, then read User-Agent and Cookie from a
# request shown in the Network tab.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
cookie = "balabala"
headers = {
    "User-Agent": user_agent,
    "Cookie": cookie,
}

# range() is half-open with step 1, so range(1, 2, 1) fetches chapter 1 only;
# widen the upper bound to fetch more chapters.
for chapter in range(1, 2, 1):
    # The timeout keeps a stalled connection from hanging the script forever.
    content = requests.get(
        f"https://www.XXX&chapterid={chapter}", headers=headers, timeout=10
    )
    # Fail loudly on an HTTP error instead of parsing an error page as a chapter.
    content.raise_for_status()
    # Override the detected encoding so that .text is decoded as GBK.
    content.encoding = "GBK"
    soup = BeautifulSoup(content.text, "html.parser")

    title = soup.find("h2")
    txt = soup.find("div", attrs={"onselectstart": "return false"})
    if title is None or txt is None:
        # find() returns None when nothing matches (e.g. an expired cookie or
        # a changed page layout); skip the chapter rather than crash.
        print(f"chapter {chapter}: title or text container not found, skipped")
        continue

    # errors="replace" prevents a UnicodeEncodeError when the page text holds
    # characters GBK cannot encode (such as replacement characters left behind
    # by lossy decoding of the response).
    with open("tanxulin.txt", "a", encoding="GBK", errors="replace") as f:
        # get_text() also works when <h2> has several children, where .string
        # would be None. The trailing newline keeps the heading on its own line.
        f.write(f"第 {chapter} 章 {title.get_text()}\n")
        for txt_content in txt:
            # Direct children without a single string (e.g. <br/>) yield None
            # and are skipped; every text fragment goes on its own line.
            readling = txt_content.string
            if readling is not None:
                f.write(readling + "\n")
魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐
所有评论(0)