本文介绍如何在 Scrapy 中利用 PhantomJS 实现异步爬取，内容非常详细，感兴趣的读者可以参考借鉴，希望对大家有所帮助。
使用时需要将 PhantomJSDownloadHandler 添加到配置文件（settings.py）的 DOWNLOAD_HANDLERS 设置中。
# encoding: utf-8
from __future__ import unicode_literals

import logging

from scrapy import signals
from scrapy.responsetypes import responsetypes
from scrapy.signalmanager import SignalManager
from scrapy.xlib.pydispatch import dispatcher
from selenium import webdriver
from six.moves import queue
from twisted.internet import defer, threads
from twisted.python.failure import Failure
class PhantomJSDownloadHandler(object):
    """Scrapy download handler that renders pages with pooled PhantomJS drivers.

    Settings read in ``__init__``:
        PHANTOMJS_OPTIONS: dict of keyword arguments forwarded to
            ``webdriver.PhantomJS`` (default ``{}``).
        PHANTOMJS_MAXRUN: maximum number of requests processed at once, also
            the capacity of the idle-driver pool (default ``10``).

    A spider may define ``response_failed(response, driver)``. When it returns
    a truthy value the response is rejected: the driver is shut down and the
    download fails instead of yielding the response.
    """

    def __init__(self, settings):
        """Create the semaphore and idle-driver pool and hook spider shutdown.

        Args:
            settings: object exposing ``get(name, default)``.
        """
        self.options = settings.get('PHANTOMJS_OPTIONS', {})

        max_run = settings.get('PHANTOMJS_MAXRUN', 10)
        self.sem = defer.DeferredSemaphore(max_run)
        # LIFO so the most recently used driver is reused first.
        self.queue = queue.LifoQueue(max_run)

        SignalManager(dispatcher.Any).connect(
            self._close, signal=signals.spider_closed)

    def download_request(self, request, spider):
        """Use a semaphore to guard the PhantomJS pool.

        Args:
            request: object whose ``url`` attribute is the page to load.
            spider: spider issuing the request; may define ``response_failed``.

        Returns:
            The Deferred produced by ``self.sem.run``; it fires with the
            rendered response or fails.
        """
        return self.sem.run(self._wait_request, request, spider)

    def _wait_request(self, request, spider):
        """Take (or create) a driver and load ``request.url`` in a worker thread.

        Returns:
            A Deferred that fires with the result of ``_response``.
        """
        try:
            driver = self.queue.get_nowait()
        except queue.Empty:
            driver = webdriver.PhantomJS(**self.options)

        def load_page():
            # Navigation may block, so it runs off the reactor thread.
            driver.get(request.url)
            # ghostdriver won't respond to a window switch until the page is
            # loaded, so this call doubles as a wait for the page load.
            driver.switch_to.window(driver.current_window_handle)

        dfd = threads.deferToThread(load_page)
        # The errback only sees load failures; _response cleans up after its
        # own errors, so a driver is never shut down twice.
        dfd.addCallbacks(
            self._response, self._load_failed,
            callbackArgs=(driver, spider), errbackArgs=(driver,))
        return dfd

    def _load_failed(self, failure, driver):
        """Shut down a driver whose page load failed and propagate the failure."""
        self._quit_driver(driver)
        return failure

    def _response(self, _, driver, spider):
        """Build the response from the loaded page and recycle the driver.

        Args:
            _: ignored result of the page-load step.
            driver: the driver that has just loaded the page.
            spider: spider whose optional ``response_failed`` hook is consulted.

        Returns:
            A Deferred that has already succeeded with the response, or has
            already failed with ``RuntimeError`` when the spider rejected it.

        Raises:
            Exception: whatever building the response or the spider hook
                raised; the driver is shut down before re-raising.
        """
        try:
            body = driver.execute_script(
                "return document.documentElement.innerHTML")
            # Cannot access response headers in Selenium; an empty <head> is
            # taken to mean a non-HTML document, so fall back to its text.
            if body.startswith("<head></head>"):
                body = driver.execute_script(
                    "return document.documentElement.textContent")
            url = driver.current_url
            # Only a short prefix of the body is used to pick the response class.
            respcls = responsetypes.from_args(
                url=url, body=body[:100].encode('utf8'))
            resp = respcls(url=url, body=body, encoding="utf-8")

            response_failed = getattr(spider, "response_failed", None)
            rejected = (response_failed and callable(response_failed)
                        and response_failed(resp, driver))
        except Exception:
            # Don't leak the PhantomJS process when the page can't be read.
            self._quit_driver(driver)
            raise

        if rejected:
            self._quit_driver(driver)
            # Failure() without an active exception raises
            # NoCurrentExceptionError, so wrap an explicit exception instead.
            return defer.fail(Failure(RuntimeError(
                "response rejected by spider.response_failed: %s" % url)))

        self.queue.put(driver)
        return defer.succeed(resp)

    def _close(self):
        """Shut down every idle driver in the pool (``spider_closed`` handler)."""
        while True:
            try:
                driver = self.queue.get_nowait()
            except queue.Empty:
                break
            self._quit_driver(driver)

    @staticmethod
    def _quit_driver(driver):
        """Terminate ``driver``, logging instead of propagating shutdown errors."""
        try:
            # close() only closes the current window; quit() ends the session
            # and stops the underlying browser process.
            driver.quit()
        except Exception:
            logging.getLogger(__name__).exception(
                "Failed to quit PhantomJS driver")
关于如何在 Scrapy 中利用 PhantomJS 实现异步爬取就分享到这里了，希望以上内容对大家有所帮助，能让大家学到更多知识。如果觉得文章不错，可以把它分享出去，让更多的人看到。