根据网页声明的不同编码选择对应的解码格式,避免解析时出现乱码;lxml 默认按 UTF-8 解析。
utf_list = ['utf', 'UTF']gbk_list = ['gbk', 'GBK']try:wbdata = requests.get(url)# proxies=proxies,# print(wbdata.text)soup = BeautifulSoup(wbdata.text, 'lxml')meta = re.findall('&l...
·
# Choose how to decode a fetched page from the charset named in its <meta>
# tags, so GB2312 / GBK / GB18030 pages do not come out garbled.
# Expects `url` to be defined by the surrounding code.
utf_list = ['utf', 'UTF']
gbk_list = ['gbk', 'GBK']
try:
    # Without a timeout, requests waits indefinitely on a stalled server.
    wbdata = requests.get(url, timeout=10)  # proxies=proxies,
    # Collect every <meta ...> tag as one string; the charset declaration is
    # searched for inside it.
    meta = re.findall('<meta .*?>', wbdata.text, re.S)
    meta = str(meta)
    head = meta
    # Compare case-insensitively: pages also write e.g. "GB2312" or "Utf-8".
    head_lower = head.lower()
    if 'gb2312' in head_lower:
        wbdata.encoding = 'gb2312'
        print('-------------->2312')
    elif any(name.lower() in head_lower for name in utf_list):
        wbdata.encoding = 'utf-8'
        print('-------------->utf')
    elif any(name.lower() in head_lower for name in gbk_list):
        wbdata.encoding = 'gbk'
        print('--------------->gbk')
    elif '18030' in head_lower:
        wbdata.encoding = 'gb18030'
    # If no branch matched, the encoding requests guessed is left in place.
    print('status code', wbdata.status_code)
    # Parse once, after the encoding is settled, so the tree is built from
    # correctly decoded text.
    soup = BeautifulSoup(wbdata.text, 'lxml')
    title_tag = soup.find('title')
    # A page without <title> yields None; fall back to an empty title
    # instead of raising AttributeError.
    title = title_tag.text if title_tag is not None else ''
    print('title', title)
except requests.exceptions.Timeout:
    print('time out------------------------------------>')
except requests.exceptions.RequestException as err:
    # Connection errors, invalid URLs, etc. are not timeouts; say so.
    print('request failed------------------------------------>', err)
except Exception as err:
    # Keep the script best-effort as before, but report the real cause
    # rather than mislabeling it as a timeout.
    print('page processing failed------------------------------------>', err)

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐
所有评论(0)