# -*- coding: utf8 -*-
'''
Created on 2013-12-19
@author: good-temper
'''
import contextlib
import cookielib
import os
import re
import urllib
import urllib2

import bs4
# Baidu endpoints used by the login flow below.
URL_BAIDU_INDEX = u'http://www.baidu.com/'
# Alternative noted by the original author as also usable:
# https://passport.baidu.com/v2/api/?getapi&class=login&tpl=mn&tangram=true
URL_BAIDU_TOKEN = 'https://passport.baidu.com/v2/api/?getapi&tpl=pp&apiver=v3&class=login'
URL_BAIDU_LOGIN = 'https://passport.baidu.com/v2/api/?login'

# Account credentials.
# Read from the BAIDU_USERNAME / BAIDU_PASSWORD environment variables when
# they are set, so real credentials do not have to be written into this file.
# When the variables are absent both values fall back to the original
# empty-string placeholders (edit them here if you prefer).
username = os.environ.get('BAIDU_USERNAME', '')
password = os.environ.get('BAIDU_PASSWORD', '')
# Cookie handling: the CookieJar stores and replays cookies automatically,
# so nothing has to be set by hand on later requests.
cj = cookielib.CookieJar()
cookieHandler = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookieHandler)
# Install globally so every subsequent urllib2.urlopen() call shares the jar.
urllib2.install_opener(opener)

# Request the index page first, through the cookie-aware opener.
reqReturn = urllib2.urlopen(URL_BAIDU_INDEX)
# Fetch the login token.
# The response body is read inside closing() so the HTTP response is always
# released, even if read() raises.
with contextlib.closing(urllib2.urlopen(URL_BAIDU_TOKEN)) as tokenReturn:
    tokenBody = tokenReturn.read()

# The token is pulled out of the response text with a regex rather than a
# JSON parser, exactly as the original script did.
matchVal = re.search(u'"token" : "(?P<tokenVal>.*?)"', tokenBody)
if matchVal is None:
    # Fail with a clear message instead of an AttributeError on None.group().
    raise RuntimeError(
        'Baidu login token not found in the response from ' + URL_BAIDU_TOKEN)
tokenVal = matchVal.group('tokenVal')
# Build the login form. Per the original author, these fields were obtained
# by capturing browser traffic for the
# https://passport.baidu.com/v2/api/?login request.
loginForm = {
    'username': username,
    'password': password,
    'u': 'https://passport.baidu.com/',
    'tpl': 'pp',
    'token': tokenVal,
    'staticpage': 'https://passport.baidu.com/static/passpc-account/html/v3Jump.html',
    'isPhone': 'false',
    'charset': 'UTF-8',
    'callback': 'parent.bd__pcbs__ra48vi',
}
# URL-encode the form into the request body string.
postData = urllib.urlencode(loginForm)
# Send the login request (supplying a body makes urllib2 issue a POST).
loginRequest = urllib2.Request(URL_BAIDU_LOGIN, postData)
# Headers are added in the same order as before.
for headerName, headerValue in (
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip,deflate,sdch'),
    ('Accept-Language', 'zh-CN,zh;q=0.8'),
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36'),
    ('Content-Type', 'application/x-www-form-urlencoded'),
):
    loginRequest.add_header(headerName, headerValue)
# The response body is not read here.
sendPost = urllib2.urlopen(loginRequest)
# Fetch the user's Tieba "liked forums" page to check whether login worked;
# the shared cookie jar means no extra session handling is needed here.
# http://tieba.baidu.com/home/main?un=XXXX&fr=index is the Tieba personal
# home page, from which (per the original author) the other pages are linked.
teibaUrl = 'http://tieba.baidu.com/f/like/mylike?v=1387441831248'
likedPage = urllib2.urlopen(teibaUrl)
# The body is decoded as GBK and re-encoded as UTF-8 before use.
content = likedPage.read().decode('gbk').encode('utf8')
# Single-argument print(...) behaves exactly like the Python 2 print statement.
print(content)
# Parse the page with BeautifulSoup 4 (the original author notes it feels
# less pleasant to use than jsoup).
soup = bs4.BeautifulSoup(content)

# Skip the first <tr> (presumably the table's header row -- confirm against
# the page markup). A plain slice replaces the old `list[1:len(list)]`, and
# the builtin name `list` is no longer rebound.
rows = soup.findAll('tr')[1:]

# Collected (url, forum name, level) triples, one per forum row.
careTeibalist = []
print('貼吧鏈接\t吧名\t等級')
for elem in rows:
    # Search the row Tag directly instead of re-serialising and re-parsing it.
    link = elem.find('a')
    # attrs must be a dict; the original passed the set {'class', 'like_badge'}.
    badge = elem.find('a', {'class': 'like_badge'})
    if link is None or badge is None:
        # Row does not have the expected anchors; skip it rather than crash
        # with a TypeError on None[...].
        continue
    entry = (
        'http://tieba.baidu.com/' + link['href'],
        link['title'],
        badge['title'],
    )
    careTeibalist.append(entry)
    print('\t'.join(entry))
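# NOTE: a stray line of web-page text (appears to be a "contact customer service" link label) followed the script here; it is not part of the program.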