最新消息: 电脑我帮您提供丰富的电脑知识,编程学习,软件下载,win7系统下载。

使用Puppeteer抓取无限滚动网页中的图像,但被await的异步函数没有返回任何内容

IT培训 admin 7浏览 0评论

使用Puppeteer抓取无限滚动网页中的图像,但被await的异步函数没有返回任何内容

我正在使用node.js和Puppeteer,尝试从一个无限滚动的网页中抓取一定数量的图像URL(我只是用reddit主页做测试,你也可以换成自己的网页),但是本应保存这些URL的数组却是空的。

我甚至把这些代码全部放进了一个匿名的异步函数里,以便用await强制等待它执行完,以防是时序问题:

// Question snippet, kept exactly as posted: it tries to collect at least
// `iItemCount` .jpg/.png image URLs from an infinitely scrolling page by
// repeatedly reading document.images and scrolling to the bottom.
const puppeteer = require('puppeteer');

var pBrowser = await puppeteer.launch();
console.log("\t* Browser launched");
var pPage       = await pBrowser.newPage();
console.log("\t* Page launched");

// `foo` is not defined in this snippet; presumably a placeholder for the target URL.
let sUrl = foo;
await pPage.goto(sUrl);

let iItemCount = 10;
let tImageSrcs = [];
// NOTE: this async function expression is only created, never invoked (there is
// no trailing `()` after the closing brace), so none of its body runs.
await async function () {
    let iPreviousHeight;
    console.log("Starting search at the top");
    while (tImageSrcs.length < iItemCount) {
        // The callback passed to evaluate() is executed in the page (browser) context.
        tImageSrcs = await pPage.evaluate( function() {
            let tItems = Array.from(document.images, e => e.src );
            console.log("\t\t* "+tItems.length+" images overall found within this section; trimming...");
            // Keep only URLs whose last four characters are ".jpg" or ".png".
            tItems = tItems.filter( sImage => [".jpg",".png"].includes(sImage.substring(sImage.length-4)) );
            console.log("\t\t* "+tItems.length+" images after filtering");
            // NOTE: `tImageSrcs` is a Node.js-side variable and is not passed to
            // evaluate() as an argument, so it is not available inside the page.
            return tImageSrcs.concat(tItems);
        });
        console.log("\t* "+tImageSrcs.length+" appropriate images sourced so far...");
        // NOTE: `evalutate` is spelled this way in the post (compare `evaluate` above).
        iPreviousHeight = await pPage.evalutate("document.body.scrollHeight");
        await pPage.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        console.log("Searching at scroll height "+iPreviousHeight);
        // Wait until the document has grown past the height recorded before scrolling.
        await pPage.waitForFunction(`document.body.scrollHeight > ${iPreviousHeight}`);
        // NOTE: `page` is not declared anywhere in this snippet; only `pPage` is.
        await page.waitFor(1000);
    }
};
// Reached with tImageSrcs still [] because the function above was never called.
console.log("\t* "+tImageSrcs.length+" images sourced");

但似乎异步函数里的任何代码都没有被执行:无论能否找到图像,输出中都不包含函数内的任何console.log语句,甚至连异步函数(几乎)最顶部的那一条也没有:

        * Browser launched
        * Page launched
        * 0 images sourced
回答如下:

一些说明:

  1. 您创建了异步函数,但是您没有调用它,因此它不会被执行:
await async function () { /*...*/ }

应该:

await async function () { /*...*/ }()
  2. 无论如何,这个包装函数是多余的:我猜你已经处在一个可以使用await的异步函数中,所以不需要再包一层。
  3. 调用console.log("\t* "+tImageSrcs.length+" images sourced");时,tImageSrcs仍然是空的,因为上面的函数并没有执行(await只是等待函数被创建,而不是等待它执行)。
  4. 在pPage.evaluate()的回调代码中,tImageSrcs是未定义的(tImageSrcs是Node.js上下文中的变量,在浏览器上下文中不可用)。您需要把它作为可序列化的值(参数)传进去。

所以试试这个变种:

const puppeteer = require('puppeteer');

// Collects at least `iItemCount` distinct .jpg/.png image URLs from an
// infinitely scrolling page: scrape document.images, scroll to the bottom,
// wait for the page to grow, and repeat.
// NOTE: the `await`s below must run inside an async function (the answer
// assumes the snippet is already wrapped in one).
const pBrowser = await puppeteer.launch();
console.log("\t* Browser launched");
const pPage = await pBrowser.newPage();
console.log("\t* Page launched");

// `foo` is a placeholder: replace it with the URL string of the page to scrape.
let sUrl = foo;
await pPage.goto(sUrl);

let iItemCount = 10;
let tImageSrcs = [];

let iPreviousHeight;
console.log("Starting search at the top");

while (tImageSrcs.length < iItemCount) {
    // The callback runs in the browser context, so the URLs collected so far
    // are passed in as a serializable argument (`srcs`) instead of being
    // captured from the Node.js scope.
    tImageSrcs = await pPage.evaluate( function(srcs) {
        let tItems = Array.from(document.images, e => e.src );
        // These console.log calls execute inside the page, not in Node.js.
        console.log("\t\t* "+tItems.length+" images overall found within this section; trimming...");
        tItems = tItems.filter( sImage => [".jpg",".png"].some( sExt => sImage.endsWith(sExt) ) );
        console.log("\t\t* "+tItems.length+" images after filtering");
        // document.images still contains everything found on earlier passes,
        // so de-duplicate to avoid counting the same URL more than once.
        return Array.from(new Set(srcs.concat(tItems)));
    }, tImageSrcs);
    console.log("\t* "+tImageSrcs.length+" appropriate images sourced so far...");
    iPreviousHeight = await pPage.evaluate("document.body.scrollHeight");
    await pPage.evaluate('window.scrollTo(0, document.body.scrollHeight)');
    console.log("Searching at scroll height "+iPreviousHeight);
    // Resolves once lazy-loaded content has made the document taller; rejects
    // on timeout if the page stops growing.
    await pPage.waitForFunction(`document.body.scrollHeight > ${iPreviousHeight}`);
    // Give newly inserted images a moment to load. A plain timer is used
    // because page.waitFor() is a deprecated Puppeteer API.
    await new Promise( resolve => setTimeout(resolve, 1000) );
}

console.log("\t* "+tImageSrcs.length+" images sourced");

使用Puppeteer抓取无限滚动网页中的图像,但被await的异步函数没有返回任何内容

我正在使用node.js和Puppeteer,尝试从一个无限滚动的网页中抓取一定数量的图像URL(我只是用reddit主页做测试,你也可以换成自己的网页),但是本应保存这些URL的数组却是空的。

我甚至把这些代码全部放进了一个匿名的异步函数里,以便用await强制等待它执行完,以防是时序问题:

// Question snippet, kept exactly as posted: it tries to collect at least
// `iItemCount` .jpg/.png image URLs from an infinitely scrolling page by
// repeatedly reading document.images and scrolling to the bottom.
const puppeteer = require('puppeteer');

var pBrowser = await puppeteer.launch();
console.log("\t* Browser launched");
var pPage       = await pBrowser.newPage();
console.log("\t* Page launched");

// `foo` is not defined in this snippet; presumably a placeholder for the target URL.
let sUrl = foo;
await pPage.goto(sUrl);

let iItemCount = 10;
let tImageSrcs = [];
// NOTE: this async function expression is only created, never invoked (there is
// no trailing `()` after the closing brace), so none of its body runs.
await async function () {
    let iPreviousHeight;
    console.log("Starting search at the top");
    while (tImageSrcs.length < iItemCount) {
        // The callback passed to evaluate() is executed in the page (browser) context.
        tImageSrcs = await pPage.evaluate( function() {
            let tItems = Array.from(document.images, e => e.src );
            console.log("\t\t* "+tItems.length+" images overall found within this section; trimming...");
            // Keep only URLs whose last four characters are ".jpg" or ".png".
            tItems = tItems.filter( sImage => [".jpg",".png"].includes(sImage.substring(sImage.length-4)) );
            console.log("\t\t* "+tItems.length+" images after filtering");
            // NOTE: `tImageSrcs` is a Node.js-side variable and is not passed to
            // evaluate() as an argument, so it is not available inside the page.
            return tImageSrcs.concat(tItems);
        });
        console.log("\t* "+tImageSrcs.length+" appropriate images sourced so far...");
        // NOTE: `evalutate` is spelled this way in the post (compare `evaluate` above).
        iPreviousHeight = await pPage.evalutate("document.body.scrollHeight");
        await pPage.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        console.log("Searching at scroll height "+iPreviousHeight);
        // Wait until the document has grown past the height recorded before scrolling.
        await pPage.waitForFunction(`document.body.scrollHeight > ${iPreviousHeight}`);
        // NOTE: `page` is not declared anywhere in this snippet; only `pPage` is.
        await page.waitFor(1000);
    }
};
// Reached with tImageSrcs still [] because the function above was never called.
console.log("\t* "+tImageSrcs.length+" images sourced");

但似乎异步函数里的任何代码都没有被执行:无论能否找到图像,输出中都不包含函数内的任何console.log语句,甚至连异步函数(几乎)最顶部的那一条也没有:

        * Browser launched
        * Page launched
        * 0 images sourced
回答如下:

一些说明:

  1. 您创建了异步函数,但是您没有调用它,因此它不会被执行:
await async function () { /*...*/ }

应该:

await async function () { /*...*/ }()
  2. 无论如何,这个包装函数是多余的:我猜你已经处在一个可以使用await的异步函数中,所以不需要再包一层。
  3. 调用console.log("\t* "+tImageSrcs.length+" images sourced");时,tImageSrcs仍然是空的,因为上面的函数并没有执行(await只是等待函数被创建,而不是等待它执行)。
  4. 在pPage.evaluate()的回调代码中,tImageSrcs是未定义的(tImageSrcs是Node.js上下文中的变量,在浏览器上下文中不可用)。您需要把它作为可序列化的值(参数)传进去。

所以试试这个变种:

const puppeteer = require('puppeteer');

// Collects at least `iItemCount` distinct .jpg/.png image URLs from an
// infinitely scrolling page: scrape document.images, scroll to the bottom,
// wait for the page to grow, and repeat.
// NOTE: the `await`s below must run inside an async function (the answer
// assumes the snippet is already wrapped in one).
const pBrowser = await puppeteer.launch();
console.log("\t* Browser launched");
const pPage = await pBrowser.newPage();
console.log("\t* Page launched");

// `foo` is a placeholder: replace it with the URL string of the page to scrape.
let sUrl = foo;
await pPage.goto(sUrl);

let iItemCount = 10;
let tImageSrcs = [];

let iPreviousHeight;
console.log("Starting search at the top");

while (tImageSrcs.length < iItemCount) {
    // The callback runs in the browser context, so the URLs collected so far
    // are passed in as a serializable argument (`srcs`) instead of being
    // captured from the Node.js scope.
    tImageSrcs = await pPage.evaluate( function(srcs) {
        let tItems = Array.from(document.images, e => e.src );
        // These console.log calls execute inside the page, not in Node.js.
        console.log("\t\t* "+tItems.length+" images overall found within this section; trimming...");
        tItems = tItems.filter( sImage => [".jpg",".png"].some( sExt => sImage.endsWith(sExt) ) );
        console.log("\t\t* "+tItems.length+" images after filtering");
        // document.images still contains everything found on earlier passes,
        // so de-duplicate to avoid counting the same URL more than once.
        return Array.from(new Set(srcs.concat(tItems)));
    }, tImageSrcs);
    console.log("\t* "+tImageSrcs.length+" appropriate images sourced so far...");
    iPreviousHeight = await pPage.evaluate("document.body.scrollHeight");
    await pPage.evaluate('window.scrollTo(0, document.body.scrollHeight)');
    console.log("Searching at scroll height "+iPreviousHeight);
    // Resolves once lazy-loaded content has made the document taller; rejects
    // on timeout if the page stops growing.
    await pPage.waitForFunction(`document.body.scrollHeight > ${iPreviousHeight}`);
    // Give newly inserted images a moment to load. A plain timer is used
    // because page.waitFor() is a deprecated Puppeteer API.
    await new Promise( resolve => setTimeout(resolve, 1000) );
}

console.log("\t* "+tImageSrcs.length+" images sourced");
发布评论

评论列表 (0)

  1. 暂无评论