/*
 * Web scraping with Puppeteer (posted 2018-11-08)
 *
 * Puppeteer is the official headless Chrome tool from the Google Chrome team.
 * It is a Node library that provides a high-level API for controlling headless
 * Chrome over the DevTools Protocol. It can also be configured to drive a full
 * (non-headless) Chrome.
 */
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
const { URL } = require('url');
const assert = require('assert');

/**
 * Opens http://jandan.net/ooxx in a headless browser and saves every <img>
 * found in the rendered HTML into the current working directory.
 *
 * Images that cannot be saved are reported with a warning and skipped, so one
 * bad image does not abort the run. The browser is always closed, even when
 * navigation fails.
 *
 * @returns {Promise<void>}
 */
async function start() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    // Optional: filter which resources the page is allowed to request.
    // await page.setRequestInterception(true)
    // page.on('request', interceptedRequest => {
    //   if (interceptedRequest.url().endsWith('.png') ||
    //       interceptedRequest.url().endsWith('.jpg') ||
    //       interceptedRequest.url().endsWith('.gif'))
    //     interceptedRequest.abort()
    //   else
    //     interceptedRequest.continue()
    // })

    await page.goto('http://jandan.net/ooxx');
    const $ = cheerio.load(await page.content());

    // toArray() yields only the matched elements; for...in on the cheerio
    // object would also visit `length`, `options` and other properties.
    for (const img of $('img').toArray()) {
      const src = img.attribs.src;
      if (!src) {
        continue;
      }
      try {
        await saveImage(page, src);
      } catch (err) {
        // Best effort: report the failure instead of silently dropping it.
        console.warn(`Skipping ${src}: ${err.message}`);
      }
    }
  } finally {
    await browser.close();
  }
}

/**
 * Downloads a single image through the DevTools Protocol and writes it to a
 * file named after the last path segment of its URL.
 *
 * @param {object} page - Puppeteer page the image was loaded in.
 * @param {string} src - Value of the <img> `src` attribute; may be relative or
 *   protocol-relative, it is resolved against the page URL.
 * @returns {Promise<void>}
 * @throws {Error} When the URL is not http(s), has no file name, or the
 *   resource content cannot be fetched.
 */
async function saveImage(page, src) {
  const resourceUrl = new URL(src, page.url());
  if (resourceUrl.protocol !== 'http:' && resourceUrl.protocol !== 'https:') {
    throw new Error(`unsupported URL scheme "${resourceUrl.protocol}"`);
  }

  // Use only the last path segment: this drops the query string and keeps the
  // file inside the working directory.
  const fileName = path.posix.basename(resourceUrl.pathname);
  if (!fileName) {
    throw new Error('URL has no file name');
  }

  const content = await getResourceContent(page, resourceUrl.href);
  // The content is already decoded into a Buffer, so no encoding is passed.
  fs.writeFileSync(fileName, Buffer.from(content, 'base64'));
}

/**
 * Returns the DevTools Protocol client attached to the page.
 *
 * NOTE(review): `_client` is a private Puppeteer member. It is a property in
 * the version this script was written for; newer releases appear to expose it
 * as a method, so both shapes are accepted. Confirm against the installed
 * version.
 *
 * @param {object} page - Puppeteer page.
 * @returns {object} Object with a `send(method, params)` function.
 */
function getCdpClient(page) {
  return typeof page._client === 'function' ? page._client() : page._client;
}

/**
 * Gets the resource tree of the page.
 *
 * @param {object} page - Puppeteer page.
 * @returns {Promise<object>} The `frameTree` field of the
 *   `Page.getResourceTree` response.
 */
async function getResourceTree(page) {
  const { frameTree } = await getCdpClient(page).send('Page.getResourceTree');
  return frameTree;
}

/**
 * Looks up the content of a resource by URL within the page's main frame.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} url - URL of the resource to fetch.
 * @returns {Promise<string>} Base64-encoded resource content.
 * @throws {assert.AssertionError} When the response is not base64-encoded.
 */
async function getResourceContent(page, url) {
  const { content, base64Encoded } = await getCdpClient(page).send(
    'Page.getResourceContent',
    { frameId: String(page.mainFrame()._id), url },
  );
  assert.strictEqual(base64Encoded, true);
  return content;
}

start().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});