Author: 蔣蜀黍
Crawling all of the technical articles on Jobbole (伯樂在線) with the Scrapy framework.
Topics covered:
Creating a Scrapy project
Using the Scrapy shell command
Scrapy's built-in image download pipeline
A custom image download pipeline (subclassing the built-in one)
Using Scrapy's ItemLoader
Writing a custom ItemLoader
Saving items to a MySQL database synchronously
Saving items to a MySQL database asynchronously
scrapy startproject bole
scrapy genspider jobbole blog.jobbole.com
To make the spider easier to debug, create a main.py file in the project directory:
from scrapy.cmdline import execute
import sys, os

# Add the project directory to the Python path at runtime
# os.path.abspath(__file__) is the path of main.py
# os.path.dirname(os.path.abspath(__file__)) is the directory that contains main.py (the project root)
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(['scrapy', 'crawl', 'jobbole'])
Before running the spider, edit the project's settings.py: find ROBOTSTXT_OBEY and set it to False. If you leave it enabled, Scrapy fetches the site's robots.txt and filters out every URL that the robots rules disallow.
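That is, in settings.py:

ROBOTSTXT_OBEY = False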
On Windows you may then hit the error No module named 'win32api', in which case install pypiwin32:
pip install pypiwin32
If the download is slow, install it from the Douban mirror instead:
pip install -i https://pypi.douban.com/simple pypiwin32
When writing the page-parsing code you would normally have to run the whole spider and send a fresh request every time you want to see a result. Scrapy provides a convenient debugging tool, the shell, which requests the page only once:
scrapy shell http://blog.jobbole.com/111144/
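Inside the shell the downloaded page is available as response, so the selectors used below can be tried interactively before they go into the spider, for example:

>>> response.css('div.entry-header h1::text').extract_first()                               # article title
>>> response.xpath('//span[contains(@class, "vote-post-up")]/h10/text()').extract_first()   # up-vote count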
def parse_detail(self, response):
    # Parse with XPath
    # Article title
    title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first()
    # Publish date
    create_time = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first().replace('·', '').strip()
    # Up-vote count
    # contains() matches nodes whose class attribute contains vote-post-up
    up_num = response.xpath('//span[contains(@class, "vote-post-up")]/h10/text()').extract_first()
    # Bookmark (favourite) count
    fav_num = response.xpath('//span[contains(@class, "bookmark-btn")]/text()').extract_first()
    match_re = re.match(r'.*?(\d+).*', fav_num)
    if match_re:
        fav_num = match_re.group(1)
    else:
        fav_num = 0
    # Comment count
    comment_num = response.xpath('//a[@href="#article-comment"]/span/text()').extract_first()
    match_re = re.match(r'.*?(\d+).*', comment_num)
    if match_re:
        comment_num = match_re.group(1)
    else:
        comment_num = 0
    # Article body
    content = response.xpath('//div[@class="entry"]').extract_first()
    # Tags (drop the "N 評(píng)論" link that appears among them)
    tags_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
    tags_list = [element for element in tags_list if not element.strip().endswith('評(píng)論')]
    tags = ','.join(tags_list)
def parse_detail(self, response):
    # Parse with CSS selectors
    # Article title
    title = response.css('div.entry-header h1::text').extract_first()
    # Publish date
    create_time = response.css('p.entry-meta-hide-on-mobile::text').extract_first().replace('·', '').strip()
    # Up-vote count
    up_num = response.css('span.vote-post-up h10::text').extract_first()
    # Bookmark (favourite) count
    fav_num = response.css('span.bookmark-btn::text').extract_first()
    match_re = re.match(r'.*?(\d+).*', fav_num)
    if match_re:
        fav_num = match_re.group(1)
    else:
        fav_num = 0
    # Comment count
    comment_num = response.css('a[href="#article-comment"] span::text').extract_first()
    match_re = re.match(r'.*?(\d+).*', comment_num)
    if match_re:
        comment_num = match_re.group(1)
    else:
        comment_num = 0
    # Article body
    content = response.css('div.entry').extract_first()
    # Tags (drop the "N 評(píng)論" link that appears among them)
    tags_list = response.css('p.entry-meta-hide-on-mobile a::text').extract()
    tags_list = [element for element in tags_list if not element.strip().endswith('評(píng)論')]
    tags = ','.join(tags_list)
def parse(self, response):
    # Hand every article URL on the list page to Scrapy for download and parsing
    article_nodes = response.css('div#archive .floated-thumb .post-thumb a')
    for article_node in article_nodes:
        # Cover image of the article
        font_image_url = article_node.css('img::attr(src)').extract_first('')
        # URL of the article
        article_url = article_node.css('::attr(href)').extract_first('')
        # parse.urljoin joins URLs intelligently: if the URL has no domain, the domain of response.url is prepended
        # meta is used to pass data along with the Request
        yield Request(url=parse.urljoin(response.url, article_url),
                      meta={'font_image_url': parse.urljoin(response.url, font_image_url)},
                      callback=self.parse_detail)
    # URL of the next list page, parsed again by this same method
    next_url = response.css('a.next.page-numbers::attr(href)').extract_first('')
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
def parse_detail(self, response):
    article_item = JobBoleArticleItem()
    # Data passed along via meta
    # Article cover image
    font_image_url = response.meta.get('font_image_url', '')
    # Parse with CSS selectors
    # Article title
    title = response.css('div.entry-header h1::text').extract_first()
    # Publish date
    create_time = response.css('p.entry-meta-hide-on-mobile::text').extract_first().replace('·', '').strip()
    # Up-vote count
    up_num = response.css('span.vote-post-up h10::text').extract_first()
    # Bookmark (favourite) count
    fav_num = response.css('span.bookmark-btn::text').extract_first()
    match_re = re.match(r'.*?(\d+).*', fav_num)
    if match_re:
        fav_num = match_re.group(1)
    else:
        fav_num = 0
    # Comment count
    comment_num = response.css('a[href="#article-comment"] span::text').extract_first()
    match_re = re.match(r'.*?(\d+).*', comment_num)
    if match_re:
        comment_num = match_re.group(1)
    else:
        comment_num = 0
    # Article body
    content = response.css('div.entry').extract_first()
    # Tags (drop the "N 評(píng)論" link that appears among them)
    tags_list = response.css('p.entry-meta-hide-on-mobile a::text').extract()
    tags_list = [element for element in tags_list if not element.strip().endswith('評(píng)論')]
    tags = ','.join(tags_list)

    article_item['title'] = title
    article_item['create_time'] = create_time
    article_item['url'] = response.url
    article_item['font_image_url'] = [font_image_url]
    article_item['up_num'] = up_num
    article_item['fav_num'] = fav_num
    article_item['comment_num'] = comment_num
    article_item['content'] = content
    article_item['tags'] = tags
    yield article_item
class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_time = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    font_image_url = scrapy.Field()
    font_image_path = scrapy.Field()
    up_num = scrapy.Field()
    fav_num = scrapy.Field()
    comment_num = scrapy.Field()
    tags = scrapy.Field()
    content = scrapy.Field()
Add scrapy.pipelines.images.ImagesPipeline to ITEM_PIPELINES in settings.py:
import os

ITEM_PIPELINES = {
    'bole.pipelines.BolePipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 200,
}
# Item field that holds the image URLs; Scrapy reads it to know which images to download
IMAGES_URLS_FIELD = 'font_image_url'
# Directory the downloaded images are saved to
project_path = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_path, 'images')
# Only download images larger than 100x100 pixels
IMAGES_MIN_HEIGHT = 100
IMAGES_MIN_WIDTH = 100
Running the project may now fail with an error that PIL cannot be found, so install Pillow:
pip install pillow
Also note that the image pipeline expects a list of URLs, which is why the field is assigned as a list:
article_item['font_image_url'] = [font_image_url]
Scrapy's built-in image pipeline works well, but it cannot tell us the path an image was saved to, so we define our own pipeline that subclasses it:
from scrapy.pipelines.images import ImagesPipeline

# Custom image download pipeline
class ArticleImagePipeline(ImagesPipeline):
    # Override the hook that runs after an item's images have been processed
    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples; info['path'] is where the image was stored
        for key, value in results:
            font_image_path = value['path']
        item['font_image_path'] = font_image_path
        return item
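To make Scrapy use this pipeline instead of the built-in one, point ITEM_PIPELINES at it (the module path assumes the class lives in the project's pipelines.py, matching the bole.pipelines entry used earlier):

ITEM_PIPELINES = {
    'bole.pipelines.BolePipeline': 300,
    'bole.pipelines.ArticleImagePipeline': 200,
}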
from scrapy.exporters import JsonItemExporter

# Export items to a JSON file with Scrapy's built-in JsonItemExporter
class JsonExportPipeline(object):
    def __init__(self):
        self.file = open('article_export.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    # Hand every item to the exporter
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    # Called automatically by Scrapy when the spider finishes
    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
import codecs
import json

# Custom pipeline that writes items to a JSON file
class ArticleWithJsonPipeline(object):
    # Called when the pipeline is created
    def __init__(self):
        # Open the json file; codecs takes care of the encoding issues
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    # Process each item
    def process_item(self, item, spider):
        # ensure_ascii must be disabled, otherwise Chinese characters are not written correctly
        lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
        # Write one item per line
        self.file.write(lines)
        return item

    # Called automatically by Scrapy when the spider finishes
    def close_spider(self, spider):
        # Close the file handle
        self.file.close()
Install the MySQL client library: pip install mysqlclient
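The MySQL pipelines below write to a jobbole_article table in the scrapy database. The post does not show the table definition; a possible one-off setup script, with column types that are assumptions based on the item fields, might look like this:

import MySQLdb

# One-off script: create the table the pipelines insert into (the schema is a guess)
conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'scrapy', charset='utf8', use_unicode=True)
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS jobbole_article (
        url_object_id   VARCHAR(50)  NOT NULL PRIMARY KEY,
        title           VARCHAR(200) NOT NULL,
        create_time     DATE,
        url             VARCHAR(300),
        font_image_url  VARCHAR(300),
        font_image_path VARCHAR(200),
        comment_num     INT NOT NULL DEFAULT 0,
        up_num          INT NOT NULL DEFAULT 0,
        fav_num         INT NOT NULL DEFAULT 0,
        tags            VARCHAR(200),
        content         LONGTEXT
    ) DEFAULT CHARSET=utf8
''')
conn.commit()
conn.close()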
import MySQLdb

# Synchronous pipeline that writes items to MySQL
class ArticleWithMysqlPipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'scrapy', charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = '''
            INSERT INTO
                jobbole_article (title, create_time, url, url_object_id, font_image_url, comment_num, up_num, fav_num, tags, content)
            VALUES
                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        '''
        self.cursor.execute(insert_sql, (item['title'], item['create_time'], item['url'], item['url_object_id'], item['font_image_url'][0],
                                         item['comment_num'], item['up_num'], item['fav_num'], item['tags'], item['content']))
        self.conn.commit()
        return item

    # Called automatically by Scrapy when the spider finishes
    def close_spider(self, spider):
        self.conn.close()
Scrapy parses pages very quickly and the article bodies are large, so synchronous database writes cannot keep up with the parsing and end up blocking the crawl. The inserts are therefore performed asynchronously:
import MySQLdb.cursors
from twisted.enterprise import adbapi

# Asynchronous pipeline that writes items to MySQL via Twisted
class ArticleTwiterMysqlPipeline(object):
    # Scrapy calls this automatically and passes in the settings, so the connection
    # parameters can be read from settings.py (see the example settings after this class)
    @classmethod
    def from_settings(cls, settings):
        param = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USERNAME'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True
        )
        # The first argument is the name of the DB-API module the connection pool should use
        dbpool = adbapi.ConnectionPool('MySQLdb', **param)
        return cls(dbpool)

    def __init__(self, dbpool):
        self.dbpool = dbpool

    # Insert the item asynchronously through the Twisted connection pool
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    # Custom error handler
    def handle_error(self, failure, item, spider):
        print(failure)
        print(item)

    def do_insert(self, cursor, item):
        insert_sql = '''
            INSERT INTO
                jobbole_article (title, create_time, url, url_object_id, font_image_url, font_image_path, comment_num, up_num, fav_num, tags, content)
            VALUES
                (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        '''
        cursor.execute(insert_sql, (item['title'], item['create_time'], item['url'], item['url_object_id'], item['font_image_url'][0],
                                    item['font_image_path'], item['comment_num'], item['up_num'], item['fav_num'], item['tags'], item['content']))
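from_settings above reads the connection parameters from settings.py; a matching configuration could look like this (adjust the values to your own environment):

MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'scrapy'
MYSQL_USERNAME = 'root'
MYSQL_PASSWORD = 'root'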
So far the detail page has been parsed with plain selector calls, which makes the spider code long and hard to maintain. Using a custom ItemLoader instead keeps all the extraction rules in one manageable place.
class JobboleSpider(scrapy.Spider):
    # Name of the spider, used later to start the crawl
    name = 'jobbole'
    # Domains the spider is allowed to crawl
    allowed_domains = ['blog.jobbole.com']
    # Start URLs; each of them is handed to the parse method below
    start_urls = ['http://blog.jobbole.com/all-posts/']

    # Parse a list page
    def parse(self, response):
        # Hand every article URL on the list page to Scrapy for download and parsing
        article_nodes = response.css('div#archive .floated-thumb .post-thumb a')
        for article_node in article_nodes:
            # Cover image of the article
            font_image_url = article_node.css('img::attr(src)').extract_first('')
            # URL of the article
            article_url = article_node.css('::attr(href)').extract_first('')
            # parse.urljoin joins URLs intelligently: if the URL has no domain, the domain of response.url is prepended
            # meta is used to pass data along with the Request
            yield Request(url=parse.urljoin(response.url, article_url),
                          meta={'font_image_url': parse.urljoin(response.url, font_image_url)},
                          callback=self.parse_detail)
        # URL of the next list page, parsed again by this same method
        next_url = response.css('a.next.page-numbers::attr(href)').extract_first('')
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    # Parse an article detail page
    def parse_detail(self, response):
        article_item = JobBoleArticleItem()
        # Cover image passed along via meta
        font_image_url = response.meta.get('font_image_url', '')
        item_loader = JobBoleArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css('title', 'div.entry-header h1::text')
        item_loader.add_css('create_time', 'p.entry-meta-hide-on-mobile::text')
        item_loader.add_value('url', response.url)
        # get_md5 hashes the URL into a fixed-length id (a sketch of it follows this snippet)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_value('font_image_url', [font_image_url])
        item_loader.add_css('comment_num', 'a[href="#article-comment"] span::text')
        item_loader.add_css('content', 'div.entry')
        item_loader.add_css('tags', 'p.entry-meta-hide-on-mobile a::text')
        item_loader.add_css('up_num', '.vote-post-up h10')
        item_loader.add_css('fav_num', 'div.post-adds > span.btn-bluet-bigger.href-style.bookmark-btn.register-user-only::text')
        article_item = item_loader.load_item()
        yield article_item
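get_md5 is not shown in the post; a minimal sketch, assuming it simply turns the URL into its MD5 hex digest so it can serve as a fixed-length url_object_id:

import hashlib

def get_md5(url):
    # Return the 32-character MD5 hex digest of the URL
    if isinstance(url, str):
        url = url.encode('utf-8')
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()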
import datetime
import re

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join

# Strip the '·' separator and surrounding whitespace from a text value
def remove_dote(value):
    return value.replace('·', '').strip()

# Convert the date string into a date object
def date_convert(value):
    try:
        create_time = datetime.datetime.strptime(value, '%Y/%m/%d').date()
    except Exception as e:
        create_time = datetime.datetime.now().date()
    return create_time

# Extract a number from a text value
def get_num(value):
    match_re = re.match(r'.*?(\d+).*', value)
    if match_re:
        num = match_re.group(1)
    else:
        num = 0
    return int(num)

# Extract the up-vote count from the extracted h10 element
def get_up_num(value):
    match_re = re.match(r'<h10 id=".*?">(\d+)</h10>', value)
    if match_re:
        num = match_re.group(1)
    else:
        num = 0
    return int(num)

# Drop the "N 評(píng)論" entry from the tag list
def remove_comment_tag(value):
    if '評(píng)論' in value:
        return ''
    return value

# Pass the value through unchanged (used to override the default output processor)
def return_value(value):
    return value

# Custom ItemLoader
class JobBoleArticleItemLoader(ItemLoader):
    # Override the default output processor so each field yields a single value instead of a list
    default_output_processor = TakeFirst()

# Jobbole article item
class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_time = scrapy.Field(
        # Input processors applied to every value loaded into this field
        input_processor=MapCompose(remove_dote, date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    font_image_url = scrapy.Field(
        # Keep the value as a list: the image pipeline expects a list of URLs
        output_processor=MapCompose(return_value)
    )
    font_image_path = scrapy.Field()
    up_num = scrapy.Field(
        input_processor=MapCompose(get_up_num)
    )
    fav_num = scrapy.Field(
        input_processor=MapCompose(get_num),
    )
    comment_num = scrapy.Field(
        input_processor=MapCompose(get_num),
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tag),
        output_processor=Join(',')
    )
    content = scrapy.Field()
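As a quick illustration of how these processors combine (a small sketch that relies on the functions defined above, so it assumes it runs in the same module): MapCompose applies its functions in order to every value in the field, TakeFirst keeps the first non-empty result, and Join concatenates the collected values into one string.

# Input processor used for create_time: strip the '·', then parse the date
print(MapCompose(remove_dote, date_convert)(['2017/03/18 ·']))   # -> [datetime.date(2017, 3, 18)]
# Default output processor of the loader: first non-empty value
print(TakeFirst()(['', 'Python', 'Scrapy']))                      # -> 'Python'
# Output processor of the tags field: join the remaining tags with a comma
print(Join(',')(['Python', '爬蟲']))                              # -> 'Python,爬蟲'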