使用 Puppeteer 抓取 Google 地图时滚动不起作用

发布于 2025-02-02 07:25:28 字数 1634 浏览 4 评论 0原文

我正在抓取 Google 地图的地点数据,但它只返回了前 10 条用户评论,之后的评论都没有返回。我认为是滚动功能出了问题。

const puppeteer = require('puppeteer');

/**
 * Gathers the text of every element matching `.MyEned span.wiI7pd`.
 * Relies on the global `document`, so it is meant to be run inside the page
 * (it is handed to `page.evaluate` by the caller).
 *
 * @returns {string[]} The `innerText` of each matched element, in the order
 *   `querySelectorAll` yields them.
 */
function extractItems() {
  const reviewNodes = document.querySelectorAll('.MyEned span.wiI7pd');
  return Array.from(reviewNodes, (node) => node.innerText);
}

/**
 * Repeatedly extracts items from the page and tries to scroll for more until
 * at least `itemCount` items have been extracted or any step throws.
 *
 * @param {object} page - Object exposing `evaluate`, `waitForFunction` and
 *   `waitForTimeout` (the caller passes a Puppeteer page).
 * @param {Function} extractItems - Function handed to `page.evaluate`; its
 *   result replaces `items` on every pass.
 * @param {number} itemCount - Loop continues while fewer items than this were extracted.
 * @param {number} [scrollDelay=2000] - Value passed to `page.waitForTimeout` after each scroll.
 * @returns {Promise<Array>} Result of the most recent successful extraction
 *   (an empty array if none succeeded).
 */
async function scrapeItems(
  page,
  extractItems,
  itemCount,
  scrollDelay = 2000,
) {
  let items = [];
  try {
    let previousHeight;
    while (items.length < itemCount) {
      items = await page.evaluate(extractItems);
      // NOTE(review): the evaluated string starts with `div.m6QErb...`, which reads
      // like a CSS selector but is evaluated as a JavaScript expression; there is no
      // document.querySelector(...) lookup, so this presumably throws in the page — confirm.
      previousHeight = await page.evaluate('div.m6QErb.DxyBCb.scrollHeight');//intended to read the scroller's height
      // NOTE(review): this scrolls `window`, not the scroller element named above — confirm intent.
      await page.evaluate('window.scrollTo(0, div.m6QErb.DxyBCb.scrollHeight)');
      await page.waitForFunction(`div.m6QErb.DxyBCb.scrollHeight > ${previousHeight}`);
      await page.waitForTimeout(scrollDelay);
    }
    // Any error thrown above is discarded below, so the function silently
    // returns whatever was extracted before the failure.
  } catch(e) { }
  return items;
}

// Entry point: starts a browser, opens the place's page, scrapes items and prints them.
(async () => {
  // NOTE(review): connect() is called without options and its result is
  // immediately overwritten by launch() below — confirm whether it is needed.
  let browser = await puppeteer.connect();
  browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  const [page] = await browser.pages();
  // NOTE(review): the result of setViewport is not awaited here.
  page.setViewport({ width: 1280, height: 926 });

  await page.goto('https://www.google.com/maps/place/Ace+Florist+%26+Flower+Delivery/@40.8265438,-73.5011026,15z/data=!4m7!3m6!1s0x0:0x9062074cae10c10f!8m2!3d40.8265438!4d-73.5011026!9m1!1b1');

  // Auto-scroll and extract desired items from the page. The call below requests 30 items.
  const items = await scrapeItems(page, extractItems, 30);

  console.log(items)

  // Not reached if any awaited call above rejects.
  await browser.close();
})();

I am scraping google maps places data, but what is happening is that it only returns me the first 10 results of the user reviews, not after that. I think there is some problem with scroll functionality.

const puppeteer = require('puppeteer');

/**
 * Gathers the text of every element matching `.MyEned span.wiI7pd`.
 * Relies on the global `document`, so it is meant to be run inside the page
 * (it is handed to `page.evaluate` by the caller).
 *
 * @returns {string[]} The `innerText` of each matched element, in the order
 *   `querySelectorAll` yields them.
 */
function extractItems() {
  const reviewNodes = document.querySelectorAll('.MyEned span.wiI7pd');
  return Array.from(reviewNodes, (node) => node.innerText);
}

/**
 * Repeatedly extracts items from the page and tries to scroll for more until
 * at least `itemCount` items have been extracted or any step throws.
 *
 * @param {object} page - Object exposing `evaluate`, `waitForFunction` and
 *   `waitForTimeout` (the caller passes a Puppeteer page).
 * @param {Function} extractItems - Function handed to `page.evaluate`; its
 *   result replaces `items` on every pass.
 * @param {number} itemCount - Loop continues while fewer items than this were extracted.
 * @param {number} [scrollDelay=2000] - Value passed to `page.waitForTimeout` after each scroll.
 * @returns {Promise<Array>} Result of the most recent successful extraction
 *   (an empty array if none succeeded).
 */
async function scrapeItems(
  page,
  extractItems,
  itemCount,
  scrollDelay = 2000,
) {
  let items = [];
  try {
    let previousHeight;
    while (items.length < itemCount) {
      items = await page.evaluate(extractItems);
      // NOTE(review): the evaluated string starts with `div.m6QErb...`, which reads
      // like a CSS selector but is evaluated as a JavaScript expression; there is no
      // document.querySelector(...) lookup, so this presumably throws in the page — confirm.
      previousHeight = await page.evaluate('div.m6QErb.DxyBCb.scrollHeight');//intended to read the scroller's height
      // NOTE(review): this scrolls `window`, not the scroller element named above — confirm intent.
      await page.evaluate('window.scrollTo(0, div.m6QErb.DxyBCb.scrollHeight)');
      await page.waitForFunction(`div.m6QErb.DxyBCb.scrollHeight > ${previousHeight}`);
      await page.waitForTimeout(scrollDelay);
    }
    // Any error thrown above is discarded below, so the function silently
    // returns whatever was extracted before the failure.
  } catch(e) { }
  return items;
}

// Entry point: starts a browser, opens the place's page, scrapes items and prints them.
(async () => {
  // NOTE(review): connect() is called without options and its result is
  // immediately overwritten by launch() below — confirm whether it is needed.
  let browser = await puppeteer.connect();
  browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  const [page] = await browser.pages();
  // NOTE(review): the result of setViewport is not awaited here.
  page.setViewport({ width: 1280, height: 926 });

  await page.goto('https://www.google.com/maps/place/Ace+Florist+%26+Flower+Delivery/@40.8265438,-73.5011026,15z/data=!4m7!3m6!1s0x0:0x9062074cae10c10f!8m2!3d40.8265438!4d-73.5011026!9m1!1b1');

  // Auto-scroll and extract desired items from the page. The call below requests 30 items.
  const items = await scrapeItems(page, extractItems, 30);

  console.log(items)

  // Not reached if any awaited call above rejects.
  await browser.close();
})();

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3)

夜未央樱花落 2025-02-09 07:25:28

此代码正常运行:

'use strict'

const puppeteer = require('puppeteer');
/**
 * Gathers the text of every element matching `.MyEned span.wiI7pd`.
 * Relies on the global `document`, so it is meant to be run inside the page
 * (it is handed to `page.evaluate` by the caller).
 *
 * @returns {string[]} The `innerText` of each matched element, in the order
 *   `querySelectorAll` yields them.
 */
function extractItems() {
  const reviewNodes = document.querySelectorAll('.MyEned span.wiI7pd');
  return Array.from(reviewNodes, (node) => node.innerText);
}
/**
 * Collects items from an infinitely-scrolling panel: extract, scroll the
 * panel to its bottom, wait for it to grow, and repeat until at least
 * `itemCount` items are available or the panel stops growing.
 *
 * Best-effort: if scrolling or waiting fails (for example `waitForFunction`
 * times out because no more content loads), the failure is logged with
 * `console.warn` and the items gathered so far are returned.
 *
 * @param {object} page - Puppeteer page (needs `evaluate` and `waitForFunction`).
 * @param {Function} extractItems - Function evaluated in the page that returns
 *   the array of items currently rendered.
 * @param {number} itemCount - Stop once at least this many items are extracted.
 * @param {number} [scrollDelay=2000] - Milliseconds to pause after each scroll.
 * @returns {Promise<Array>} The most recently extracted items (may exceed `itemCount`).
 */
async function scrapeItems(
  page,
  extractItems,
  itemCount,
  scrollDelay = 2000,
) {
  const scrollerSelector = 'div.m6QErb.DxyBCb';
  let items = [];
  try {
    while (items.length < itemCount) {
      items = await page.evaluate(extractItems);
      // Enough items already: skip the scroll, which would otherwise block
      // on waitForFunction when the list has nothing more to load.
      if (items.length >= itemCount) {
        break;
      }

      // Selector and height travel as arguments instead of being spliced
      // into a code string.
      const previousHeight = await page.evaluate((selector) => {
        const scroller = document.querySelector(selector);
        if (!scroller) {
          throw new Error(`No element matches selector: ${selector}`);
        }
        const height = scroller.scrollHeight;
        scroller.scrollTo(0, height);
        return height;
      }, scrollerSelector);

      await page.waitForFunction(
        (selector, height) => document.querySelector(selector).scrollHeight > height,
        {},
        scrollerSelector,
        previousHeight,
      );
      // Plain timer instead of page.waitForTimeout, which newer Puppeteer
      // releases no longer provide.
      await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    }
  } catch (err) {
    console.warn(`scrapeItems stopped early with ${items.length} item(s): ${err.message}`);
  }
  return items;
}


// Entry point: launch a browser, open the place's reviews page, scrape
// items with scrapeItems and print them.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const [page] = await browser.pages();
    // setViewport returns a promise; await it so navigation starts with the final size.
    await page.setViewport({ width: 1280, height: 926 });

    await page.goto('https://www.google.com/maps/place/Ace+Florist+%26+Flower+Delivery/@40.8265438,-73.5011026,15z/data=!4m7!3m6!1s0x0:0x9062074cae10c10f!8m2!3d40.8265438!4d-73.5011026!9m1!1b1');

    // Auto-scroll and extract items from the page; 30 items are requested.
    const items = await scrapeItems(page, extractItems, 30);

    console.log(items);
  } finally {
    // Always release the browser, even when navigation or scraping throws.
    await browser.close();
  }
})().catch((err) => {
  // Surface the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

This code is working ok:

'use strict'

const puppeteer = require('puppeteer');
/**
 * Gathers the text of every element matching `.MyEned span.wiI7pd`.
 * Relies on the global `document`, so it is meant to be run inside the page
 * (it is handed to `page.evaluate` by the caller).
 *
 * @returns {string[]} The `innerText` of each matched element, in the order
 *   `querySelectorAll` yields them.
 */
function extractItems() {
  const reviewNodes = document.querySelectorAll('.MyEned span.wiI7pd');
  return Array.from(reviewNodes, (node) => node.innerText);
}
/**
 * Collects items from an infinitely-scrolling panel: extract, scroll the
 * panel to its bottom, wait for it to grow, and repeat until at least
 * `itemCount` items are available or the panel stops growing.
 *
 * Best-effort: if scrolling or waiting fails (for example `waitForFunction`
 * times out because no more content loads), the failure is logged with
 * `console.warn` and the items gathered so far are returned.
 *
 * @param {object} page - Puppeteer page (needs `evaluate` and `waitForFunction`).
 * @param {Function} extractItems - Function evaluated in the page that returns
 *   the array of items currently rendered.
 * @param {number} itemCount - Stop once at least this many items are extracted.
 * @param {number} [scrollDelay=2000] - Milliseconds to pause after each scroll.
 * @returns {Promise<Array>} The most recently extracted items (may exceed `itemCount`).
 */
async function scrapeItems(
  page,
  extractItems,
  itemCount,
  scrollDelay = 2000,
) {
  const scrollerSelector = 'div.m6QErb.DxyBCb';
  let items = [];
  try {
    while (items.length < itemCount) {
      items = await page.evaluate(extractItems);
      // Enough items already: skip the scroll, which would otherwise block
      // on waitForFunction when the list has nothing more to load.
      if (items.length >= itemCount) {
        break;
      }

      // Selector and height travel as arguments instead of being spliced
      // into a code string.
      const previousHeight = await page.evaluate((selector) => {
        const scroller = document.querySelector(selector);
        if (!scroller) {
          throw new Error(`No element matches selector: ${selector}`);
        }
        const height = scroller.scrollHeight;
        scroller.scrollTo(0, height);
        return height;
      }, scrollerSelector);

      await page.waitForFunction(
        (selector, height) => document.querySelector(selector).scrollHeight > height,
        {},
        scrollerSelector,
        previousHeight,
      );
      // Plain timer instead of page.waitForTimeout, which newer Puppeteer
      // releases no longer provide.
      await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    }
  } catch (err) {
    console.warn(`scrapeItems stopped early with ${items.length} item(s): ${err.message}`);
  }
  return items;
}


// Entry point: launch a browser, open the place's reviews page, scrape
// items with scrapeItems and print them.
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const [page] = await browser.pages();
    // setViewport returns a promise; await it so navigation starts with the final size.
    await page.setViewport({ width: 1280, height: 926 });

    await page.goto('https://www.google.com/maps/place/Ace+Florist+%26+Flower+Delivery/@40.8265438,-73.5011026,15z/data=!4m7!3m6!1s0x0:0x9062074cae10c10f!8m2!3d40.8265438!4d-73.5011026!9m1!1b1');

    // Auto-scroll and extract items from the page; 30 items are requested.
    const items = await scrapeItems(page, extractItems, 30);

    console.log(items);
  } finally {
    // Always release the browser, even when navigation or scraping throws.
    await browser.close();
  }
})().catch((err) => {
  // Surface the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
女皇必胜 2025-02-09 07:25:28

后来我发现,在读取滚动高度以及检查滚动高度是否大于之前的高度时,
都必须加上 document.querySelector。

    // Snapshot of the items rendered so far.
    items = await page.evaluate(extractItems);
    // evaluate() returns a Promise: await it so previousHeight is the numeric
    // scrollHeight (un-awaited, `previousHeight[0]` was undefined and the wait below could never succeed).
    previousHeight = await page.evaluate('document.querySelector("div.m6QErb.DxyBCb").scrollHeight');
    await page.evaluate(`document.querySelector("div.m6QErb.DxyBCb").scrollTo(0, ${previousHeight})`);
    // Resolves once the scroller has grown beyond the height recorded above.
    await page.waitForFunction(`document.querySelector("div.m6QErb.DxyBCb").scrollHeight > ${previousHeight}`);
    // Plain timer instead of page.waitForTimeout, which newer Puppeteer releases no longer provide.
    await new Promise((resolve) => setTimeout(resolve, scrollDelay));

So I just found out that I have to add document.querySelector while evaluating
scroll height and also when checking the scroll height is greater than previous Height.

    // Snapshot of the items rendered so far.
    items = await page.evaluate(extractItems);
    // evaluate() returns a Promise: await it so previousHeight is the numeric
    // scrollHeight (un-awaited, `previousHeight[0]` was undefined and the wait below could never succeed).
    previousHeight = await page.evaluate('document.querySelector("div.m6QErb.DxyBCb").scrollHeight');
    await page.evaluate(`document.querySelector("div.m6QErb.DxyBCb").scrollTo(0, ${previousHeight})`);
    // Resolves once the scroller has grown beyond the height recorded above.
    await page.waitForFunction(`document.querySelector("div.m6QErb.DxyBCb").scrollHeight > ${previousHeight}`);
    // Plain timer instead of page.waitForTimeout, which newer Puppeteer releases no longer provide.
    await new Promise((resolve) => setTimeout(resolve, scrollDelay));
沫尐诺 2025-02-09 07:25:28

您可以使用以下解决方案之一在Google Maps上使用Puppeteer滚动评论页面。

解决方案#1:

/**
 * Scrolls a container to its bottom over and over until its scroll height
 * stops changing between two consecutive passes.
 *
 * @param {object} page - Puppeteer page (only `evaluate` is used).
 * @param {string} scrollContainer - CSS selector of the scrollable element.
 * @param {number} [scrollDelay=2000] - Milliseconds to wait after each scroll
 *   before re-reading the height.
 * @returns {Promise<void>}
 * @throws {Error} Rejects when no element matches `scrollContainer`.
 */
async function scrollPage(page, scrollContainer, scrollDelay = 2000) {
  // The selector is passed as an argument rather than spliced into a code
  // string, so quotes inside it cannot break the evaluated script.
  const readHeight = () =>
    page.evaluate((selector) => {
      const container = document.querySelector(selector);
      if (!container) {
        throw new Error(`No element matches selector: ${selector}`);
      }
      return container.scrollHeight;
    }, scrollContainer);

  let lastHeight = await readHeight();
  while (true) {
    await page.evaluate((selector) => {
      const container = document.querySelector(selector);
      container.scrollTo(0, container.scrollHeight);
    }, scrollContainer);
    // Plain timer instead of page.waitForTimeout, which newer Puppeteer
    // releases no longer provide.
    await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    const newHeight = await readHeight();
    if (newHeight === lastHeight) {
      break;
    }
    lastHeight = newHeight;
  }
}

其中 scrollContainer 是带有滚动条的元素(在评论页面上是 .DxyBCb),page 是 Puppeteer 的页面对象。

解决方案2:

/**
 * Scrolls each matching element into view in turn, then waits and re-counts;
 * stops when a pass ends with the same number of matching elements it started with.
 *
 * @param {object} page - Puppeteer page (only `evaluate` is used).
 * @param {string} scrollElements - CSS selector matching the items inside the
 *   scroll container.
 * @param {number} [itemDelay=200] - Milliseconds to wait before scrolling each item.
 * @param {number} [loadDelay=5000] - Milliseconds to wait after a pass before
 *   checking whether new items appeared.
 * @returns {Promise<void>}
 */
async function scrollPage(page, scrollElements, itemDelay = 200, loadDelay = 5000) {
  // Plain timer instead of page.waitForTimeout, which newer Puppeteer
  // releases no longer provide.
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const countElements = () =>
    page.evaluate((selector) => document.querySelectorAll(selector).length, scrollElements);

  // Index of the next element to visit; kept across passes so already
  // visited elements are not scrolled again.
  let currentElement = 0;
  while (true) {
    const elementsLength = await countElements();
    for (; currentElement < elementsLength; currentElement++) {
      await sleep(itemDelay);
      await page.evaluate(
        (index, selector) => {
          const element = document.querySelectorAll(selector)[index];
          // The list may have changed since it was counted; skip a missing entry.
          if (element) {
            element.scrollIntoView();
          }
        },
        currentElement,
        scrollElements,
      );
    }
    await sleep(loadDelay);
    const newElementsLength = await countElements();
    if (newElementsLength === elementsLength) break;
  }
}

在这种情况下,scrollElements 必须是滚动容器中各条目的选择器(.jftiEf)。

您可以在我的博客文章《使用 Node.js 抓取 Google 地图评论》(Web Scraping Google Maps Reviews with Nodejs)中了解更多关于抓取 Google 地图评论的内容。

You can scroll the reviews page on Google Maps with Puppeteer using one of the solutions below.

Solution #1:

/**
 * Scrolls a container to its bottom over and over until its scroll height
 * stops changing between two consecutive passes.
 *
 * @param {object} page - Puppeteer page (only `evaluate` is used).
 * @param {string} scrollContainer - CSS selector of the scrollable element.
 * @param {number} [scrollDelay=2000] - Milliseconds to wait after each scroll
 *   before re-reading the height.
 * @returns {Promise<void>}
 * @throws {Error} Rejects when no element matches `scrollContainer`.
 */
async function scrollPage(page, scrollContainer, scrollDelay = 2000) {
  // The selector is passed as an argument rather than spliced into a code
  // string, so quotes inside it cannot break the evaluated script.
  const readHeight = () =>
    page.evaluate((selector) => {
      const container = document.querySelector(selector);
      if (!container) {
        throw new Error(`No element matches selector: ${selector}`);
      }
      return container.scrollHeight;
    }, scrollContainer);

  let lastHeight = await readHeight();
  while (true) {
    await page.evaluate((selector) => {
      const container = document.querySelector(selector);
      container.scrollTo(0, container.scrollHeight);
    }, scrollContainer);
    // Plain timer instead of page.waitForTimeout, which newer Puppeteer
    // releases no longer provide.
    await new Promise((resolve) => setTimeout(resolve, scrollDelay));
    const newHeight = await readHeight();
    if (newHeight === lastHeight) {
      break;
    }
    lastHeight = newHeight;
  }
}

Where scrollContainer is an element with scroll (on the reviews page it is .DxyBCb) and page is the Puppeteer's page.

Solution #2:

/**
 * Scrolls each matching element into view in turn, then waits and re-counts;
 * stops when a pass ends with the same number of matching elements it started with.
 *
 * @param {object} page - Puppeteer page (only `evaluate` is used).
 * @param {string} scrollElements - CSS selector matching the items inside the
 *   scroll container.
 * @param {number} [itemDelay=200] - Milliseconds to wait before scrolling each item.
 * @param {number} [loadDelay=5000] - Milliseconds to wait after a pass before
 *   checking whether new items appeared.
 * @returns {Promise<void>}
 */
async function scrollPage(page, scrollElements, itemDelay = 200, loadDelay = 5000) {
  // Plain timer instead of page.waitForTimeout, which newer Puppeteer
  // releases no longer provide.
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  const countElements = () =>
    page.evaluate((selector) => document.querySelectorAll(selector).length, scrollElements);

  // Index of the next element to visit; kept across passes so already
  // visited elements are not scrolled again.
  let currentElement = 0;
  while (true) {
    const elementsLength = await countElements();
    for (; currentElement < elementsLength; currentElement++) {
      await sleep(itemDelay);
      await page.evaluate(
        (index, selector) => {
          const element = document.querySelectorAll(selector)[index];
          // The list may have changed since it was counted; skip a missing entry.
          if (element) {
            element.scrollIntoView();
          }
        },
        currentElement,
        scrollElements,
      );
    }
    await sleep(loadDelay);
    const newElementsLength = await countElements();
    if (newElementsLength === elementsLength) break;
  }
}

In this case, scrollElements must be the items selector in the scroll container (.jftiEf).

You can read more about scraping Google Maps Reviews from my blog post Web Scraping Google Maps Reviews with Nodejs.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文