利用Python实现新浪微博爬虫
新版新浪微博模拟登陆请看:http://blog.csdn.net/monsion/article/details/8656690 。本文后面解决动态加载的程序依然有效。本文重新编辑过一次,此前出了点儿问题。第一个模块:模拟登陆sina微博,创建weiboLogin.py文件,输入以下代码:
新版新浪微博模拟登陆请看:http://blog.csdn.net/monsion/article/details/8656690
本文后面的解决动态加载的程序依然有效
重新编辑了一次,出了点儿问题
第一个模块,模拟登陆sina微博,创建weiboLogin.py文件,输入以下代码:
- #! /usr/bin/env python
- # -*- coding: utf-8 -*-
- import sys
- import urllib
- import urllib2
- import cookielib
- import base64
- import re
- import json
- import hashlib
- class weiboLogin:
- cj = cookielib.LWPCookieJar()
- cookie_support = urllib2.HTTPCookieProcessor(cj)
- opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
- urllib2.install_opener(opener)
- postdata = {
- 'entry': 'weibo',
- 'gateway': '1',
- 'from': '',
- 'savestate': '7',
- 'userticket': '1',
- 'ssosimplelogin': '1',
- 'vsnf': '1',
- 'vsnval': '',
- 'su': '',
- 'service': 'miniblog',
- 'servertime': '',
- 'nonce': '',
- 'pwencode': 'wsse',
- 'sp': '',
- 'encoding': 'UTF-8',
- 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
- 'returntype': 'META'
- }
- def get_servertime(self):
- url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939'
- data = urllib2.urlopen(url).read()
- p = re.compile('\((.*)\)')
- try:
- json_data = p.search(data).group(1)
- data = json.loads(json_data)
- servertime = str(data['servertime'])
- nonce = data['nonce']
- return servertime, nonce
- except:
- print 'Get severtime error!'
- return None
- def get_pwd(self, pwd, servertime, nonce):
- pwd1 = hashlib.sha1(pwd).hexdigest()
- pwd2 = hashlib.sha1(pwd1).hexdigest()
- pwd3_ = pwd2 + servertime + nonce
- pwd3 = hashlib.sha1(pwd3_).hexdigest()
- return pwd3
- def get_user(self, username):
- username_ = urllib.quote(username)
- username = base64.encodestring(username_)[:-1]
- return username
- def login(self,username,pwd):
- url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.18)'
- try:
- servertime, nonce = self.get_servertime()
- except:
- print 'get servertime error!'
- return
- weiboLogin.postdata['servertime'] = servertime
- weiboLogin.postdata['nonce'] = nonce
- weiboLogin.postdata['su'] = self.get_user(username)
- weiboLogin.postdata['sp'] = self.get_pwd(pwd, servertime, nonce)
- weiboLogin.postdata = urllib.urlencode(weiboLogin.postdata)
- headers = {'User-Agent':'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0 Chrome/20.0.1132.57 Safari/536.11'}
- req = urllib2.Request(
- url = url,
- data = weiboLogin.postdata,
- headers = headers
- )
- result = urllib2.urlopen(req)
- text = result.read()
- p = re.compile('location\.replace\(\'(.*?)\'\)')
- try:
- login_url = p.search(text).group(1)
- urllib2.urlopen(login_url)
- print "Login success!"
- except:
- print 'Login error!'
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import urllib
import urllib2
import cookielib
import base64
import re
import json
import hashlib
class weiboLogin:
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
postdata = {
'entry': 'weibo',
'gateway': '1',
'from': '',
'savestate': '7',
'userticket': '1',
'ssosimplelogin': '1',
'vsnf': '1',
'vsnval': '',
'su': '',
'service': 'miniblog',
'servertime': '',
'nonce': '',
'pwencode': 'wsse',
'sp': '',
'encoding': 'UTF-8',
'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
'returntype': 'META'
}
def get_servertime(self):
url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939'
data = urllib2.urlopen(url).read()
p = re.compile('\((.*)\)')
try:
json_data = p.search(data).group(1)
data = json.loads(json_data)
servertime = str(data['servertime'])
nonce = data['nonce']
return servertime, nonce
except:
print 'Get severtime error!'
return None
def get_pwd(self, pwd, servertime, nonce):
pwd1 = hashlib.sha1(pwd).hexdigest()
pwd2 = hashlib.sha1(pwd1).hexdigest()
pwd3_ = pwd2 + servertime + nonce
pwd3 = hashlib.sha1(pwd3_).hexdigest()
return pwd3
def get_user(self, username):
username_ = urllib.quote(username)
username = base64.encodestring(username_)[:-1]
return username
def login(self,username,pwd):
url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.3.18)'
try:
servertime, nonce = self.get_servertime()
except:
print 'get servertime error!'
return
weiboLogin.postdata['servertime'] = servertime
weiboLogin.postdata['nonce'] = nonce
weiboLogin.postdata['su'] = self.get_user(username)
weiboLogin.postdata['sp'] = self.get_pwd(pwd, servertime, nonce)
weiboLogin.postdata = urllib.urlencode(weiboLogin.postdata)
headers = {'User-Agent':'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0 Chrome/20.0.1132.57 Safari/536.11'}
req = urllib2.Request(
url = url,
data = weiboLogin.postdata,
headers = headers
)
result = urllib2.urlopen(req)
text = result.read()
p = re.compile('location\.replace\(\'(.*?)\'\)')
try:
login_url = p.search(text).group(1)
urllib2.urlopen(login_url)
print "Login success!"
except:
print 'Login error!'
然后创建main.py文件,输入以下代码:
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import weiboLogin
- import urllib
- import urllib2
- username = '你的微博用户名'
- pwd = '你的微博密码'
- WBLogin = weiboLogin.weiboLogin()
- WBLogin.login(username, pwd)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Entry script: log in to Sina Weibo with the credentials defined below."""
import weiboLogin
import urllib
import urllib2

# Placeholders: replace both values with your own account before running.
username = '你的微博用户名'
pwd = '你的微博密码'

if __name__ == '__main__':
    # Guarded so that importing this module never triggers a network login.
    WBLogin = weiboLogin.weiboLogin()
    WBLogin.login(username, pwd)
注意:若登陆失败,可能是你的账号在登陆的时候需要输入验证码!你在网页上登陆你的账号试试看,在账号设置里面可以设置某些地区不输入验证码。
参考:http://www.douban.com/note/201767245/
接下来,考虑实现抓取微博的内容。
此时遇到一个困难:抓取指定URL的微博时,页面初始只显示15条,其余内容是延迟加载的(即Ajax中常说的lazy load)。也就是说,滚动条第一次拖到最下面时会加载第二部分,再次拖到最下面时会加载第三部分,此时一个页面的微博才算完整。所以,要获取一个微博页面的全部微博,需要对这个页面请求三次。创建getWeiboPage.py文件,相应代码如下:
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import urllib
- import urllib2
- import sys
- import time
- reload(sys)
- sys.setdefaultencoding('utf-8')
- class getWeiboPage:
- body = {
- '__rnd':'',
- '_k':'',
- '_t':'0',
- 'count':'50',
- 'end_id':'',
- 'max_id':'',
- 'page':1,
- 'pagebar':'',
- 'pre_page':'0',
- 'uid':''
- }
- uid_list = []
- charset = 'utf8'
- def get_msg(self,uid):
- getWeiboPage.body['uid'] = uid
- url = self.get_url(uid)
- self.get_firstpage(url)
- self.get_secondpage(url)
- self.get_thirdpage(url)
- def get_firstpage(self,url):
- getWeiboPage.body['pre_page'] = getWeiboPage.body['page']-1
- url = url +urllib.urlencode(getWeiboPage.body)
- req = urllib2.Request(url)
- result = urllib2.urlopen(req)
- text = result.read()
- self.writefile('./output/text1',text)
- self.writefile('./output/result1',eval("u'''"+text+"'''"))
- def get_secondpage(self,url):
- getWeiboPage.body['count'] = '15'
- # getWeiboPage.body['end_id'] = '3490160379905732'
- # getWeiboPage.body['max_id'] = '3487344294660278'
- getWeiboPage.body['pagebar'] = '0'
- getWeiboPage.body['pre_page'] = getWeiboPage.body['page']
- url = url +urllib.urlencode(getWeiboPage.body)
- req = urllib2.Request(url)
- result = urllib2.urlopen(req)
- text = result.read()
- self.writefile('./output/text2',text)
- self.writefile('./output/result2',eval("u'''"+text+"'''"))
- def get_thirdpage(self,url):
- getWeiboPage.body['count'] = '15'
- getWeiboPage.body['pagebar'] = '1'
- getWeiboPage.body['pre_page'] = getWeiboPage.body['page']
- url = url +urllib.urlencode(getWeiboPage.body)
- req = urllib2.Request(url)
- result = urllib2.urlopen(req)
- text = result.read()
- self.writefile('./output/text3',text)
- self.writefile('./output/result3',eval("u'''"+text+"'''"))
- def get_url(self,uid):
- url = 'http://weibo.com/' + uid + '?from=otherprofile&wvr=3.6&loc=tagweibo'
- return url
- def get_uid(self,filename):
- fread = file(filename)
- for line in fread:
- getWeiboPage.uid_list.append(line)
- print line
- time.sleep(1)
- def writefile(self,filename,content):
- fw = file(filename,'w')
- fw.write(content)
- fw.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib
import urllib2
import sys
import time
reload(sys)
sys.setdefaultencoding('utf-8')
class getWeiboPage:
body = {
'__rnd':'',
'_k':'',
'_t':'0',
'count':'50',
'end_id':'',
'max_id':'',
'page':1,
'pagebar':'',
'pre_page':'0',
'uid':''
}
uid_list = []
charset = 'utf8'
def get_msg(self,uid):
getWeiboPage.body['uid'] = uid
url = self.get_url(uid)
self.get_firstpage(url)
self.get_secondpage(url)
self.get_thirdpage(url)
def get_firstpage(self,url):
getWeiboPage.body['pre_page'] = getWeiboPage.body['page']-1
url = url +urllib.urlencode(getWeiboPage.body)
req = urllib2.Request(url)
result = urllib2.urlopen(req)
text = result.read()
self.writefile('./output/text1',text)
self.writefile('./output/result1',eval("u'''"+text+"'''"))
def get_secondpage(self,url):
getWeiboPage.body['count'] = '15'
# getWeiboPage.body['end_id'] = '3490160379905732'
# getWeiboPage.body['max_id'] = '3487344294660278'
getWeiboPage.body['pagebar'] = '0'
getWeiboPage.body['pre_page'] = getWeiboPage.body['page']
url = url +urllib.urlencode(getWeiboPage.body)
req = urllib2.Request(url)
result = urllib2.urlopen(req)
text = result.read()
self.writefile('./output/text2',text)
self.writefile('./output/result2',eval("u'''"+text+"'''"))
def get_thirdpage(self,url):
getWeiboPage.body['count'] = '15'
getWeiboPage.body['pagebar'] = '1'
getWeiboPage.body['pre_page'] = getWeiboPage.body['page']
url = url +urllib.urlencode(getWeiboPage.body)
req = urllib2.Request(url)
result = urllib2.urlopen(req)
text = result.read()
self.writefile('./output/text3',text)
self.writefile('./output/result3',eval("u'''"+text+"'''"))
def get_url(self,uid):
url = 'http://weibo.com/' + uid + '?from=otherprofile&wvr=3.6&loc=tagweibo'
return url
def get_uid(self,filename):
fread = file(filename)
for line in fread:
getWeiboPage.uid_list.append(line)
print line
time.sleep(1)
def writefile(self,filename,content):
fw = file(filename,'w')
fw.write(content)
fw.close()
在刚刚的main.py中加入相应内容,完整内容为:
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import weiboLogin
- import getWeiboMsg
- import urllib
- import urllib2
- username = '你的微博用户名'
- pwd = '你的微博密码'
- WBLogin = weiboLogin.weiboLogin()
- WBLogin.login(username, pwd)
- WBmsg = getWeiboMsg.getWeiboMsg()
- url = 'http://weibo.com/1624087025?from=otherprofile&wvr=3.6&loc=tagweibo'
- WBmsg.get_firstpage(url)
- WBmsg.get_secondpage(url)
- WBmsg.get_thirdpage(url)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Entry script: log in to Sina Weibo, then download one page in three parts."""
import weiboLogin
# The crawler module is the getWeiboPage.py file created for this article and
# its class is getWeiboPage; importing any other name raises ImportError.
import getWeiboPage
import urllib
import urllib2

# Placeholders: replace both values with your own account before running.
username = '你的微博用户名'
pwd = '你的微博密码'

if __name__ == '__main__':
    # Guarded so that importing this module never triggers network traffic.
    WBLogin = weiboLogin.weiboLogin()
    WBLogin.login(username, pwd)

    # Results are written below ./output/, which must exist beforehand.
    WBmsg = getWeiboPage.getWeiboPage()
    url = 'http://weibo.com/1624087025?from=otherprofile&wvr=3.6&loc=tagweibo'
    WBmsg.get_firstpage(url)
    WBmsg.get_secondpage(url)
    WBmsg.get_thirdpage(url)
参考:http://www.cnblogs.com/sickboy/archive/2012/01/08/2316248.html
执行python main.py,应该可以运行,结果保存在./output/文件夹中,该文件夹自己提前创建好。
昨天搞了一个下午,很多东西还没弄好,欢迎留言交流。
魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐



所有评论(0)