scrapy初級(jí)-創(chuàng)新互聯(lián)

scrapy 補(bǔ)充方法 路徑拼接

response.urljoin(不完整的鏈接)

成都服務(wù)器托管,創(chuàng)新互聯(lián)建站提供包括服務(wù)器租用、光華機(jī)房服務(wù)器托管、帶寬租用、云主機(jī)、機(jī)柜租用、主機(jī)租用托管、CDN網(wǎng)站加速、空間域名等業(yè)務(wù)的一體化完整服務(wù)。電話咨詢:18982081108

多字典時(shí)使用

管道中判斷是否為某一個(gè)字典

if isinstance(item, 字典名):

保存

?return item

判斷網(wǎng)站ip類型,從而掛ip類型

if request.url.startswith('http://'):

    request.meta["proxy"] = 'http://' + ip

elif request.url.startswith('https://'):

    request.meta["proxy"] = 'https://' + ip

pipeline 管道

class 管道名(object):

  1. ? def __init__(self):

? self.文件=open("文件名.后綴",'寫入方式',encoding="utf_8")

  1. ? def open_spider(self,spider):

self.文件=open("文件名.后綴",'寫入方式',encoding="utf_8")

2.           def process_item(self, item, spider):

                 # 管道中判斷是否為某一個(gè)字典

                 if isinstance(item, 字典名):

                     保存

                     return item

if isinstance(item, 字典名):

    data = dict(item)

    self.file.write(json.dumps(data, ensure_ascii=False) + ',\n')

?#ensure_ascii=False:這是因?yàn)閖son.dumps?序列化時(shí)對(duì)中文默認(rèn)使用的ascii編碼.想輸出真正的中文需要指定ensure_ascii=False

??????????????????????????

? 3.? ? ? ? ? ? ? def close_spider(self,spider):

??self.文件.close()#關(guān)閉文件

? 3.? ? ? ? ? ? ? def __del__(self):

?self.文件.close()#關(guān)閉文件

setting文件 代理及user_agent

# Enable downloader middlewares (lower number = applied closer to the engine).
DOWNLOADER_MIDDLEWARES = {
    # 'demo_58.middlewares.Demo58DownloaderMiddleware': 543,
    # rotate User-Agent on every request
    'demo_58.middlewares.UserAgentDownloadMiddleware': 543,
    # rotate proxy IP on every request
    'demo_58.middlewares.RandomProxy': 542,
}

請(qǐng)求頭

# Default headers attached to every outgoing request.
# NOTE(review): 'accept-language' and 'Accept-Language' are distinct dict keys
# here, but HTTP header names are case-insensitive, so only one will take
# effect — confirm which language the author intended and drop the other.
DEFAULT_REQUEST_HEADERS = {
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'referer': 'https://cs.58.com/chuzu/?PGTID=0d100000-0019-e694-7c7e-9295d99e15c1&ClickID=2'
}

middlewares文件 隨機(jī)ip
class RandomProxy:
    """Downloader middleware that attaches a randomly chosen proxy to each request."""

    # Pool of candidate proxy endpoints (host:port).
    ip_list = [
        '124.116.116.13:4228',
        '122.194.194.139:4212',
        '36.42.248.45:4215',
        '1.83.250.183:4228',
        '49.85.43.175:4223',
        '121.205.229.70:4231',

    ]

    # Hook name is fixed by Scrapy's middleware contract; called for every
    # request handed to the downloader.
    def process_request(self, request, spider):
        # Pick one endpoint at random for this request.
        chosen = random.choice(self.ip_list)

        # Register it in the request's meta dict; the scheme prefix is
        # required when the pool entries are bare host:port strings.
        request.meta['proxy'] = 'https://' + chosen

        print('IP:', request.meta)
隨機(jī)user_agent

class UserAgentDownloadMiddleware:
    """Downloader middleware that sets a random User-Agent header per request."""

    # Candidate User-Agent strings. The originals contained garbled
    # line-wrap characters mid-string (which would send corrupt headers);
    # restored to single spaces here.
    user_agent = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:77.0) Gecko/20190101 Firefox/77.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
        'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16.2'
    ]

    # Hook name is fixed by Scrapy's middleware contract; every request
    # handed to the downloader passes through here.
    def process_request(self, request, spider):
        # Choose a random UA and set it on the request headers.
        u_a = random.choice(self.user_agent)
        request.headers['User-Agent'] = u_a
item字典
# Item for rental (zufang) listings.
class Demo58Item_zufang(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()  # rental listing title
    price = scrapy.Field()  # rental listing price

# Item for second-hand (ershou) listings.
class Demo58Item_ershou(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()  # second-hand listing title
    price = scrapy.Field()  # second-hand listing price
spider文件

import scrapy
# 導(dǎo)入字典 (import the item classes)
from demo_58.items import Demo58Item_zufang,Demo58Item_ershou

class SpiderSpider(scrapy.Spider):
    """Spider for 58.com: follows the rental (chuzu) and second-hand
    (ershoufang) category links from the start page, then parses each
    category's listings and paginates through result pages."""
    name = 'spider'

    # restrict the crawl to this domain
    allowed_domains = ['58.com']

    # entry URL
    start_urls = ['http://58.com/']

    def parse(self, response):
        """Extract category links and dispatch each to its parser."""
        # NOTE(review): the attribute predicates ('//div[@]...') look stripped
        # by text extraction — restore the original class filters before use.
        links = response.xpath('//div[@]//span[@]/a/@href').extract()

        for link in links:
            # response.urljoin() resolves a relative href against the page URL
            href = response.urljoin(link)

            # only follow the two category sub-pages we care about
            if '58.com/chuzu/' in href:
                yield scrapy.Request(url=href, callback=self.get_zufang_data)
            if '58.com/ershoufang/' in href:
                yield scrapy.Request(url=href, callback=self.get_ershoufang_data)

    def get_zufang_data(self, response):
        """Parse rental listings, then schedule result pages 2-11."""
        titles = response.xpath('//div[@]/h2/a/text()').extract()  # titles
        prices = response.xpath('//div[@]//div[@]/b/text()').extract()  # prices

        for title, price in zip(titles, prices):
            # one item instance per (title, price) pair
            zufang_item = Demo58Item_zufang()
            zufang_item['title'] = title
            zufang_item['price'] = price
            # hand the item back to the engine (and on to the pipelines)
            yield zufang_item

        # pagination: pages 2-11 (duplicates are filtered by Scrapy's dedupe)
        for i in range(2, 12):
            print('當(dāng)前正在下載租房的第{}頁(yè)'.format(i))
            z_next_url = 'https://cs.58.com/chuzu/pn{}/'.format(i)
            # callback= names the method that will parse the response
            yield scrapy.Request(url=z_next_url, callback=self.get_zufang_data)

    def get_ershoufang_data(self, response):
        """Parse second-hand listings, then schedule result pages 2-11."""
        titles = response.xpath('//div[@]/div[@]/h3/text()').extract()  # titles
        prices = response.xpath('//p[@]/span[@]/text()').extract()  # prices

        for title, price in zip(titles, prices):
            ershou_item = Demo58Item_ershou()
            ershou_item['title'] = title.strip()
            ershou_item['price'] = price
            yield ershou_item

        # pagination: pages 2-11
        for i in range(2, 12):
            print('當(dāng)前正在下載二手房的第{}頁(yè)'.format(i))
            next_url = 'https://cs.58.com/ershoufang/p{}/'.format(i)
            yield scrapy.Request(url=next_url, callback=self.get_ershoufang_data)

if __name__ == '__main__':
    # Convenience launcher: equivalent to running `scrapy crawl spider`
    # from the project directory.
    from scrapy import cmdline
    cmdline.execute(['scrapy', 'crawl', 'spider'])

你是否還在尋找穩(wěn)定的海外服務(wù)器提供商?創(chuàng)新互聯(lián)www.cdcxhl.cn海外機(jī)房具備T級(jí)流量清洗系統(tǒng)配攻擊溯源,準(zhǔn)確流量調(diào)度確保服務(wù)器高可用性,企業(yè)級(jí)服務(wù)器適合批量采購(gòu),新人活動(dòng)首月15元起,快前往官網(wǎng)查看詳情吧

新聞標(biāo)題:scrapy初級(jí)-創(chuàng)新互聯(lián)
標(biāo)題鏈接:http://muchs.cn/article0/dpicio.html

成都網(wǎng)站建設(shè)公司_創(chuàng)新互聯(lián),為您提供App開(kāi)發(fā)、App設(shè)計(jì)、關(guān)鍵詞優(yōu)化、網(wǎng)站設(shè)計(jì)公司、電子商務(wù)網(wǎng)站內(nèi)鏈

廣告

聲明:本網(wǎng)站發(fā)布的內(nèi)容(圖片、視頻和文字)以用戶投稿、用戶轉(zhuǎn)載內(nèi)容為主,如果涉及侵權(quán)請(qǐng)盡快告知,我們將會(huì)在第一時(shí)間刪除。文章觀點(diǎn)不代表本網(wǎng)站立場(chǎng),如需處理請(qǐng)聯(lián)系客服。電話:028-86922220;郵箱:631063699@qq.com。內(nèi)容未經(jīng)允許不得轉(zhuǎn)載,或轉(zhuǎn)載時(shí)需注明來(lái)源: 創(chuàng)新互聯(lián)

手機(jī)網(wǎng)站建設(shè)