import requests

def search_keyword(keyword):
    # search the YanXuan store for a keyword and collect the product IDs on page 1
    uri = 'https://you.163.com/xhr/search/search.json'
    query = {
        'keyword': keyword,
        'page': 1
    }
    try:
        res = requests.get(uri, params=query).json()
        result = res['data']['directly']['searcherResult']['result']
        product_id = []
        for r in result:
            product_id.append(r['id'])
        return product_id
    except:
        raise
Here I only fetch the product IDs on page 1; next, those product IDs are used to fetch the comments under each product. A single comment record looks like this:
{
    "skuInfo": [
        "機型:iphone X",
        "顏色:透明白"
    ],
    "frontUserName": "c****x",
    "frontUserAvatar": "https://yanxuan.nosdn.127.net/c230f2c2a6e7223810755518ce5cd62f",
    "content": "手機套很薄,裸機的手感,硅膠的摸著好舒服,看著屏幕都變大了。顏值很高。",
    "createTime": 1554982110981,
    "picList": [
        "https://yanxuan.nosdn.127.net/749cfcb4877be8e7fce449900d763731.jpg",
        "https://yanxuan.nosdn.127.net/6744d04efcd7509556b710bc0d9fa6b0.jpg"
    ],
    "commentReplyVO": null,
    "memberLevel": 4,
    "appendCommentVO": null,
    "star": 5,
    "itemId": 3444035
}
import time

def details(product_id):
    # page through the comment list of one product and save each page to MongoDB
    url = 'https://you.163.com/xhr/comment/listByItemByTag.json'
    try:
        C_list = []
        for i in range(1, 100):
            query = {
                'itemId': product_id,
                'page': i,
            }
            res = requests.get(url, params=query).json()
            if not res['data']['commentList']:
                break
            print('爬取第 %s 頁評論' % i)
            commentList = res['data']['commentList']
            C_list.append(commentList)
            time.sleep(1)
            # save to mongoDB
            try:
                mongo_collection.insert_many(commentList)
            except:
                continue
        return C_list
    except:
        raise
from pymongo import MongoClient

conn = MongoClient('mongodb://%s:%s@ds149974.mlab.com:49974/you163' % ('you163', 'you163'))
db = conn.you163
mongo_collection = db.iPhone
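With the two functions and the MongoDB connection in place, a minimal driver could look like the sketch below; the keyword '手機殼' (phone case) is only an illustrative example, not taken from the original run.

# Hedged driver sketch combining the functions above;
# the search keyword is an arbitrary example.
if __name__ == '__main__':
    for pid in search_keyword('手機殼'):
        details(pid)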
Scraping the Boss 直聘 website
from bs4 import BeautifulSoup
import requests

# header: request-header dict (User-Agent, Cookie, etc.), assumed to be defined elsewhere in the project
url = 'https://www.zhipin.com/job_detail/?query=python&city=101010100'
res = requests.get(url, headers=header).text
print(res)
content = BeautifulSoup(res, 'html.parser')
ul = content.find_all('ul')
print(ul[12])
Python: from this we can get the URL of the job's detail page
10-15K: the salary of each job
柯萊特集團: the name of the hiring company
北京 朝陽區(qū) 望京|3-5年|學(xué)歷不限: the job's detail info (location, experience required, education)
job_details_uri = job.find('h3', attrs={'class': 'name'}).find('a')['href']
job_company = job.find('div', attrs={'class': 'company-text'}).find('h3', attrs={'class': 'name'}).find('a').text
job_salary = job.find('h3', attrs={'class': 'name'}).find('span', attrs={'class': 'red'}).text
rege = r'<p>([\u4e00-\u9fa5 ]+)<em class="vline"></em>([\d+-年]+|[\u4e00-\u9fa5]+)<em class="vline"></em>([\u4e00-\u9fa5]+)'
I won't go into the details of the regular expression here; if you are not familiar with it, look it up on your own. A quick check of what it captures is sketched right below.
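As a quick sanity check, here is a hedged sketch of the pattern applied to a hand-written <p> snippet; the sample string is illustrative and may not match the site's exact markup.

import re

rege = r'<p>([\u4e00-\u9fa5 ]+)<em class="vline"></em>([\d+-年]+|[\u4e00-\u9fa5]+)<em class="vline"></em>([\u4e00-\u9fa5]+)'

# Hand-written sample resembling str(job.find('p')); the real markup may differ.
sample = '<p>北京 朝陽區(qū) 望京<em class="vline"></em>3-5年<em class="vline"></em>學(xué)歷不限</p>'
m = re.match(rege, sample)
print(m.group(1), m.group(2), m.group(3))  # 北京 朝陽區(qū) 望京 3-5年 學(xué)歷不限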
job_list = []  # collected job dicts
for job in jobs:
    job_dict = {}
    job_details_uri = job.find('h3', attrs={'class': 'name'}).find('a')['href']
    job_company = job.find('div', attrs={'class': 'company-text'}).find('h3', attrs={'class': 'name'}).find('a').text
    job_salary = job.find('h3', attrs={'class': 'name'}).find('span', attrs={'class': 'red'}).text
    job_details = str(job.find('p'))
    print(job_details)
    job_rege = re.match(rege, job_details)
    job_dict['name'] = job_company
    job_dict['uri'] = job_details_uri
    job_dict['salary'] = job_salary
    job_dict['site'] = job_rege.group(1)
    job_dict['year'] = job_rege.group(2)
    job_dict['edu'] = job_rege.group(3)
    job_list.append(job_dict)
print(job_list)
https://www.zhipin.com/c101010100/?query=python&page=
c101010100: the city code, which here stands for Beijing
query: obviously enough, our search keyword
page: the page number
def jobs(page):
    for i in range(1, page + 1):
        job_list = []
        try:
            print('正在抓取第 %s 頁數(shù)據(jù)' % i)
            uri = '/c101010100/?query=python&page=%s' % i
            res = requests.get(config.url + uri, headers=header).text
            content = BeautifulSoup(res, 'html.parser')
            ul = content.find_all('ul')
            jobs = ul[12].find_all('li')
            ...
            print(job_list)
            # save to mongoDB
            try:
                mongo_collection.insert_many(job_list)
            except:
                continue
            time.sleep(1)
        except:
            continue
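Starting the crawl is then a single call; the page count of 10 is only an example, not a value from the original run.

# Illustrative call; 10 pages is an arbitrary choice.
jobs(10)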
https://www.zhipin.com/job_detail/a8920821a7487a901HJ43tm7EFY~.html
job_conn = MongoClient('mongodb://%s:%s@ds151612.mlab.com:51612/boss' % ('boss', 'boss123'))
job_db = job_conn.boss
job_collection = job_db.boss
details_collection = job_db.job_details
def run_main():
    jobs = job_collection.find()
    for job in jobs:
        print('獲得工作的uri ', job['uri'])
        get_details(job)
        time.sleep(1)

def get_details(items):
    base_url = config.url
    url = base_url + items['uri']
    company_name = items['name']
    try:
        res = requests.get(url, headers=header).text
        content = BeautifulSoup(res, 'html.parser')
        text = content.find('div', attrs={'class': 'text'}).text.strip()
        result = {'name': company_name, 'details': text}
        details_collection.insert_one(result)
    except:
        raise

if __name__ == '__main__':
    run_main()
s.weibo.com/user?q=林志玲
def get_uid(name):
    try:
        url = 'https://s.weibo.com/user?q=%s' % name
        res = requests.get(url).text
        content = BeautifulSoup(res, 'html.parser')
        user = content.find('div', attrs={'class': 'card card-user-b s-pg16 s-brt1'})
        user_info = user.find('div', attrs={'class': 'info'}).find('div')
        href_list = user_info.find_all('a')
        if len(href_list) == 3:
            title = href_list[1].get('title')
            if title == '微博個人認(rèn)證':
                uid = href_list[2].get('uid')
                return uid
            elif title == '微博會員':
                uid = href_list[2].get('uid')
                return uid
            else:
                print('Something went wrong')
                return False
    except:
        raise
https://m.weibo.cn/api/container/getIndex?uid=1312412824&luicode=10000011&lfid=100103type%3D1%26q%3D%E6%9E%97%E5%BF%97%E7%8E%B2&containerid=1005051312412824
https://m.weibo.cn/api/container/getIndex?uid=1312412824&luicode=10000011&lfid=100103type%3D1%26q%3D%E6%9E%97%E5%BF%97%E7%8E%B2&containerid=1076031312412824
def get_userinfo(uid):
    try:
        url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=%s' % uid
        res = requests.get(url).json()
        containerid = res['data']['tabsInfo']['tabs'][1]['containerid']
        mblog_counts = res['data']['userInfo']['statuses_count']
        followers_count = res['data']['userInfo']['followers_count']
        userinfo = {
            'containerid': containerid,
            'mblog_counts': mblog_counts,
            'followers_count': followers_count
        }
        return userinfo
    except:
        raise
def get_blog_info(cards, i, name, page):
    blog_dict = {}
    if cards[i]['card_type'] == 9:
        scheme = cards[i]['scheme']  # URL of the Weibo post
        mblog = cards[i]['mblog']
        mblog_text = mblog['text']
        create_time = mblog['created_at']
        mblog_id = mblog['id']
        reposts_count = mblog['reposts_count']  # number of reposts
        comments_count = mblog['comments_count']  # number of comments
        attitudes_count = mblog['attitudes_count']  # number of likes
        with open(name, 'a', encoding='utf-8') as f:
            f.write('----第' + str(page) + '頁,第' + str(i + 1) + '條微博----' + '\n')
            f.write('微博地址:' + str(scheme) + '\n' + '發(fā)布時間:' + str(create_time) + '\n'
                    + '微博內(nèi)容:' + mblog_text + '\n' + '點贊數(shù):' + str(attitudes_count) + '\n'
                    + '評論數(shù):' + str(comments_count) + '\n' + '轉(zhuǎn)發(fā)數(shù):' + str(reposts_count) + '\n')
        blog_dict['mblog_id'] = mblog_id
        blog_dict['mblog_text'] = mblog_text
        blog_dict['create_time'] = create_time
        return blog_dict
    else:
        print('沒有任何微博哦')
        return False
The first parameter receives the value of res['data']['cards'], a list of card data (each card is a dict);
the second parameter is the loop counter of the calling function;
the third parameter is the name of the big V being crawled;
the fourth parameter is the page currently being crawled.
def get_blog_by_text(containerid, blog_text, name):
    blog_list = []
    page = 1
    while True:
        try:
            url = 'https://m.weibo.cn/api/container/getIndex?containerid=%s&page=%s' % (containerid, page)
            res_code = requests.get(url).status_code
            if res_code == 418:
                print('訪問太頻繁,過會再試試吧')
                return False
            res = requests.get(url).json()
            cards = res['data']['cards']
            if len(cards) > 0:
                for i in range(len(cards)):
                    print('-----正在爬取第' + str(page) + '頁,第' + str(i + 1) + '條微博------')
                    blog_dict = get_blog_info(cards, i, name, page)
                    if blog_dict is False:  # not a normal post card, skip it
                        continue
                    blog_list.append(blog_dict)
                    mblog_text = blog_dict['mblog_text']
                    create_time = blog_dict['create_time']
                    if blog_text in mblog_text:
                        print('找到相關(guān)微博')
                        return blog_dict['mblog_id']
                    elif checkTime(create_time, config.day) is False:
                        print('沒有找到相關(guān)微博')
                        return blog_list
                page += 1
                time.sleep(config.sleep_time)
            else:
                print('沒有任何微博哦')
                break
        except:
            pass
def checkTime(inputtime, day):
    try:
        intime = datetime.datetime.strptime('2019-' + inputtime, '%Y-%m-%d')
    except:
        return '時間轉(zhuǎn)換失敗'
    now = datetime.datetime.now()
    n_days = now - intime
    days = n_days.days
    if days < day:
        return True
    else:
        return False
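For reference, a quick hedged illustration of how this helper behaves; the '04-12' style month-day string is an assumption about the created_at format the mobile API returns for older posts.

# Hedged examples of checkTime; the input formats are assumptions.
print(checkTime('04-12', 90))    # True if 2019-04-12 falls within the last 90 days
print(checkTime('3分鐘前', 90))   # unparseable by strptime -> returns '時間轉(zhuǎn)換失敗'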
day = 90  # how far back in time to crawl; e.g. 60 means only fetch posts from the last two months
sleep_time = 5  # delay between requests; 5-10 s is recommended
https://weibo.com/1312412824/HxFY84Gqb?filter=hot&root_comment_id=0&type=comment#_rnd1567155548217
https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4380261561116383&from=singleWeiBo&__rnd=1567155729639
https://weibo.com/aj/v6/comment/big?ajwvr=6&id=%s&page=%s
id is the id of the Weibo post whose comments we want to fetch, which we already obtained from the interface above;
page is the page number of the request.
def get_comment(self, mblog_id, page):
    comment = []
    for i in range(0, page):
        print('-----正在爬取第' + str(i) + '頁評論')
        url = 'https://weibo.com/aj/v6/comment/big?ajwvr=6&id=%s&page=%s' % (mblog_id, i)
        req = requests.get(url, headers=self.headers).text
        html = json.loads(req)['data']['html']
        content = BeautifulSoup(html, 'html.parser')
        comment_text = content.find_all('div', attrs={'class': 'WB_text'})
        for c in comment_text:
            _text = c.text.split(':')[1]
            comment.append(_text)
        time.sleep(config.sleep_time)
    return comment
def download_comment(self, comment):
    comment_pd = pd.DataFrame(columns=['comment'], data=comment)
    timestamp = str(int(time.time()))
    comment_pd.to_csv(timestamp + 'comment.csv', encoding='utf-8')
from weibo_spider import WeiBo
from config import headers

def main(name, spider_type, text, page, iscomment, comment_page):
    print('開始...')
    weibo = WeiBo(name, headers)
    ...

if __name__ == '__main__':
    target_name = input('type the name: ')
    spider_type = input('type spider type(Text or Page): ')
    text = '你好'
    page_count = 10
    iscomment = 'No'
    comment_page_count = 100
    while spider_type not in ('Text', 'text', 'Page', 'page'):
        spider_type = input('type spider type(Text or Page): ')
    ...
class WeiBo(object):

    def __init__(self, name, headers):
        self.name = name
        self.headers = headers

    def get_uid(self):  # get the user's UID
        ...

    def get_userinfo(self, uid):  # get the user's info, including the containerid
        ...

    def get_blog_by_page(self, containerid, page, name):  # fetch the Weibo posts on a given page
        ...

    def get_blog_by_text(self, containerid, blog_text, name):  # a simple search: find the post matching the given text
        ...

    def get_comment(self, mblog_id, page):  # used together with the previous function to fetch a post's comments
        ...

    def download_comment(self, comment):  # save the comments
        ...
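To make the class's flow concrete, here is a hedged usage sketch assuming the method signatures listed above; the target name, search text, and page counts are arbitrary examples rather than values from the original program.

# Hedged usage sketch of the WeiBo class; all literal values are examples.
from weibo_spider import WeiBo
from config import headers

wb = WeiBo('林志玲', headers)
uid = wb.get_uid()
userinfo = wb.get_userinfo(uid)

# get_blog_by_text returns the matching post's mblog_id, or a list of posts if no match was found
result = wb.get_blog_by_text(userinfo['containerid'], '你好', wb.name)
if isinstance(result, str):
    comments = wb.get_comment(result, 10)  # first 10 pages of comments
    wb.download_comment(comments)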
import datetime
from config import day

def checkTime(inputtime, day):
    ...

def get_blog_info(cards, i, name, page):
    ...
http://api.dongqiudi.com/search?keywords=&type=all&page=
id is the id of each article page;
thumb is the goddess's cover picture;
url is likewise the address of the page where the goddess appears.
def get_list(page):
    nvshen_id_list = []
    nvshen_id_picture = []
    for i in range(1, page):
        print('獲取第' + str(i) + '頁數(shù)據(jù)')
        url = 'http://api.dongqiudi.com/search?keywords=%E5%A5%B3%E7%A5%9E%E5%A4%A7%E4%BC%9A&type=all&page=' + str(i)
        html = requests.get(url=url).text
        news = json.loads(html)['news']
        if len(news) == 0:
            print('沒有更多啦')
            break
        nvshen_id = [k['id'] for k in news]
        nvshen_id_list = nvshen_id_list + nvshen_id
        nvshen_id_picture = nvshen_id_picture + [{k['id']: k['thumb']} for k in news]
        time.sleep(1)
    return nvshen_id_list, nvshen_id_picture
http://www.dongqiudi.com/news/1193890
def download_page(nvshen_id_list):
    for i in nvshen_id_list:
        print('正在下載ID為' + i + '的HTML網(wǎng)頁')
        url = 'https://www.dongqiudi.com/news/%s' % i
        download = DownloadPage()
        html = download.getHtml(url)
        download.saveHtml(i, html)
        time.sleep(2)

class DownloadPage(object):

    def getHtml(self, url):
        html = requests.get(url=url, cookies=config.session, headers=config.header).content
        return html

    def saveHtml(self, file_name, file_content):
        with open('html_page/' + file_name + '.html', 'wb') as f:
            f.write(file_content)
session = {
    'dqduid': 'yours'
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win32; x32; rv:54.0) Gecko/20100101 Firefox/54.0',
    'Connection': 'keep-alive'
}
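Putting the two steps together, a hedged driver might look like the following; the page count of 20 is an arbitrary choice for illustration.

# Illustrative driver; 20 pages is an arbitrary example.
nvshen_id_list, nvshen_id_picture = get_list(20)
download_page(nvshen_id_list)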
content.find_all('span', attrs={'style': 'color:#ff0000'})
However, some pages follow different rules, so a portion of the HTML files cannot be parsed this way; since there are only a few of them, I handled those by hand. A hedged sketch of parsing the local files is given after the skeleton below.
def deal_loaclfile(nvshen_id_picture):
    files = os.listdir('html_page/')
    nvshen_list = []
    special_page = []
    for f in files:
        ...
    return nvshen_list, special_page
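As a rough illustration of what the elided loop body could do, here is a hedged sketch that opens a saved HTML file and pulls out the red spans matched by the selector shown above; the helper name parse_local_page is hypothetical and the span layout varies by page, so this is an assumption rather than the original implementation.

import os
from bs4 import BeautifulSoup

# Hedged sketch, not the original implementation: read one saved page and
# collect the text inside the red (color:#ff0000) spans.
def parse_local_page(file_name):
    with open('html_page/' + file_name, 'r', encoding='utf-8') as f:
        content = BeautifulSoup(f.read(), 'html.parser')
    red_spans = content.find_all('span', attrs={'style': 'color:#ff0000'})
    return [s.text.strip() for s in red_spans]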
def get_picture(c, t_list, n_id_p):
    nvshen_l = []
    tmp_prev_id = c.find_all('a', attrs={'target': '_self'})
    for j in tmp_prev_id:
        ...
w_list = ['吳宣儀', '30萬', '826965', '68', '825847',
          'https://img1.dongqiudi.com/fastdfs3/M00/74/54/180x135/crop/-/ChOxM1vIPpOAZT8AAAHza_WMyRk175.png']
g_list = ['關(guān)之琳', '20萬', '813611', '88', '812559',
          'https://img1.dongqiudi.com/fastdfs3/M00/6B/94/180x135/crop/-/ChOxM1u1gx2AZ7qmAABi3gRdHS8715.jpg']
t_list = ['佟麗婭', '22萬', '797779', '93', '795697',
          'https://img1.dongqiudi.com/fastdfs3/M00/60/A7/180x135/crop/-/ChOxM1ufUh2AJdR0AABXtcU22fg956.jpg']
y_list = ['楊丞琳', '7萬', '1173681', '45', '1168209',
          'https://img1.qunliao.info/fastdfs4/M00/CA/F7/ChMf8F0pTOKAaefqAA5nOMM0LK0171.jpg']
https://movie.douban.com/celebrity/1166896/photos/?type=C&start=30&sortby=like&size=a&subtype=a
the link in the a tag leads to each picture's comment info;
the link in the img tag can be used to save the goddess's poster.
def get_posters():
    comment_url_list = []
    picture_list = []
    for i in range(0, 40000, 30):
        url = 'https://movie.douban.com/celebrity/1166896/photos/?type=C&start=%s&sortby=like&size=a&subtype=a' % str(i)
        req = requests.get(url).text
        content = BeautifulSoup(req, 'html.parser')
        check_point = content.find('span', attrs={'class': 'next'}).find('a')
        if check_point is not None:
            data = content.find_all('div', attrs={'class': 'cover'})
            for k in data:
                ulist = k.find('a')['href']
                plist = k.find('img')['src']
                comment_url_list.append(ulist)
                picture_list.append(plist)
        else:
            break
    return comment_url_list, picture_list
After that, the posters can be downloaded; a minimal sketch follows.
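The original download code is not shown here, so the following is only a hedged sketch: it assumes the img links collected in picture_list can be fetched directly, and the 'posters' directory name and download_posters helper are chosen purely for illustration.

import os
import requests

# Hedged sketch: save every poster URL collected by get_posters();
# the 'posters' directory and .jpg suffix are assumptions.
def download_posters(picture_list):
    os.makedirs('posters', exist_ok=True)
    for idx, pic_url in enumerate(picture_list):
        resp = requests.get(pic_url)
        with open(os.path.join('posters', '%d.jpg' % idx), 'wb') as f:
            f.write(resp.content)

comment_url_list, picture_list = get_posters()
download_posters(picture_list)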
def get_comment(comment_l):
    client = pymongo.MongoClient('mongodb://douban:douban1989@ds149744.mlab.com:49744/douban')
    db = client.douban
    mongo_collection = db.comment
    comment_list = []
    comment = []
    print('Save to MongoDB')
    for i in comment_l:
        response = requests.get(i).text
        content = BeautifulSoup(response, 'html.parser')
        tmp_list = content.find_all('div', attrs={'class': 'comment-item'})
        comment_list = comment_list + tmp_list
        for k in tmp_list:  # only this page's items, so earlier comments are not inserted twice
            tmp_comment = k.find('p').text
            mongo_collection.insert_one({'comment': tmp_comment})
            comment.append(tmp_comment)
    print('Save Finish!')