Description
Environment
- chrome-aws-lambda Version: chrome-aws-lambda:22
- puppeteer / puppeteer-core Version: 21.5.2
- OS: Windows
- Node.js Version: v18.18.0
- Lambda / GCF Runtime: nodejs14.x
Expected Behavior
I currently have a page that needs to be scrolled down to fully load the elements I'm trying to scrape.
When I run my scraping code locally, it scrolls fine and successfully loads all the elements.
Snapshot after a local run, where all elements are successfully loaded:
Current Behavior
However, when I run this in Lambda, the scrolling does not work.
The elements are not fully loaded (only the 50 elements that load initially, versus the 400+ that load when I run it locally).
I've tried using different selectors as targets to scroll to, but none seem to work.
Snapshot from the Lambda run, taken after scrolling is called:
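As a next step (not yet verified on Lambda), I was considering replacing the scrollIntoView call with a scroll step like the sketch below: it sets scrollTop directly on the scroll container and also dispatches synthetic wheel input via Puppeteer's page.mouse.wheel, in case the virtualized list only reacts to real scroll events. scrollStep is just an illustrative name; the selector would be the same scrollableSelector used in the code below.

// Hypothetical alternative to the scrollToBottom step in my Scraper class below.
// Assumes the lazy-loaded list reacts to scrolling of its container element.
async function scrollStep(page, scrollableSelector) {
  // Option 1: push the scroll container's own scrollTop to the bottom.
  await page.evaluate((sel) => {
    const el = document.querySelector(sel);
    if (el) el.scrollTop = el.scrollHeight;
  }, scrollableSelector);

  // Option 2: dispatch synthetic wheel input so the page sees "real" scrolling.
  await page.mouse.wheel({ deltaY: 2000 });
}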
Steps to Reproduce
URL: https://sports.bwin.pt/pt/sports/futebol-4/apostar
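// Lambda handler (entry point)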
const AWS = require('aws-sdk')
const s3 = new AWS.S3({ apiVersion: '2006-03-01' });
const chromium = require('chrome-aws-lambda');

const pageURL = process.env.TARGET_URL
const agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'

const Bwin = require('./scrapers/bwin.js')
const db = require('./db.js')

exports.handler = async (event, context) => {
  let result = null;
  let browser = null;

  try {
    browser = await chromium.puppeteer.launch({
      args: chromium.args,
      defaultViewport: chromium.defaultViewport,
      executablePath: await chromium.executablePath,
      headless: chromium.headless,
      ignoreHTTPSErrors: true,
    });

    await db.connectToDb()

    let page = await browser.newPage();
    await page.setUserAgent(agent)

    console.log('Navigating to page: ', pageURL)
    await page.goto(pageURL, { waitUntil: 'networkidle2' })

    let bwin = new Bwin()
    let events = await bwin.scrapeEvents(page)
    console.log('length: ', events.length)

    const buffer = await page.screenshot()

    // upload the image using the current timestamp as filename
    const s3result = await s3
      .upload({
        Bucket: 'mybucket',
        Key: `${Date.now()}-${events.length}.png`,
        Body: buffer,
        ContentType: 'image/png',
        ACL: 'public-read'
      })
      .promise()

    console.log('S3 image URL:', s3result.Location)
    console.log('URL: ', page.url())

    await page.close();
    await browser.close();
  } catch (error) {
    console.log(error)
  } finally {
    if (browser !== null) {
      await browser.close();
    }
  }

  return result
}
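
// scrapers/scraper.js — shared Scraper base class (path inferred from the require() calls)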
const db = require('../db.js')
const AWS = require('aws-sdk')
const s3 = new AWS.S3({ apiVersion: '2006-03-01' });

class Scraper {
  async scrapeEvents(page) {
    await this.loadPage(page)
    page.on('console', msg => console.log('PAGE LOG:', msg.text()));
    await this.loadAllElements(page, this.eventSelector)
    let events = await page.$$eval(this.eventSelector, this.getEventInfo)
    events = events.map(event => this.parseEventInfo(event))
    return events
  }

  async loadPage(page) {
    await this.closePopUp(page);
    await this.loadElements(page, this.eventSelector, 0)
  }

  async loadElements(page, elementSelector, elementCount) {
    console.log('count: ', elementCount)

    const buffer = await page.screenshot()

    // upload the image using the current timestamp as filename
    const s3result = await s3
      .upload({
        Bucket: 'mybucket',
        Key: `${Date.now()}-elementCount:${elementCount}.png`,
        Body: buffer,
        ContentType: 'image/png',
        ACL: 'public-read'
      })
      .promise()

    console.log('S3 image URL:', s3result.Location)

    try {
      await page.waitForFunction((elementSelector, elementCount) => {
        return document.querySelectorAll(elementSelector).length != elementCount;
      }, { timeout: 30000 }, elementSelector, elementCount);
    } catch (error) {
      throw error
    }
  }

  async loadAllElements(page, elementSelector) {
    try {
      while (true) {
        let elementCount = await page.evaluate(this.scrollToBottom, this.scrollableSelector, elementSelector)
        await this.loadElements(page, elementSelector, elementCount)
      }
    } catch (error) {
      console.error(error)
    }
  }

  async closePopUp(page) {
    await this.loadElements(page, this.popupSelector, 0)
    await page.evaluate((sel) => document.querySelector(sel).click(), this.popupSelector)
    console.log('popup closed')
  }

  scrollToBottom(scrollableSelector, elementSelector) {
    let elementCount = document.querySelectorAll(elementSelector).length;
    document.querySelector(scrollableSelector).scrollIntoView({ behavior: "smooth", block: "end" })
    console.log('SCROLLED TO BOTTOM')
    return elementCount;
  }
}

module.exports = Scraper;
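
// scrapers/bwin.js — Bwin subclass with the site-specific selectors and parsers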
const Scraper = require('./scraper.js')

class Bwin extends Scraper {
  eventSelector = 'ms-event'
  popupSelector = 'button#onetrust-accept-btn-handler'
  // scrollableSelector = 'div#main-view'
  scrollableSelector = 'div.grid-footer'

  async getEventInfo(events) {
    return events.map(event => {
      return {
        home: event.querySelector('div.participant-wrapper:nth-child(1)').textContent.trim(),
        away: event.querySelector('div.participant-wrapper:nth-child(2)').textContent.trim(),
        date: event.querySelector('ms-event-timer').textContent.trim(),
        competition: event.closest('ms-event-group').querySelector('ms-league-header').textContent.trim()
      }
    })
  }

  parseEventInfo(event) {
    let home = event.home
    let away = event.away
    let [country, competition] = this.parseCompetition(event.competition)
    let date = this.parseDate(event.date)
    return {
      home,
      away,
      date,
      country,
      competition
    }
  }
  parseCompetition(competition) {
    let country;
    [country, competition] = competition.split('|').map(str => str.trim())
    return [country, competition]
  }
  parseDate(dateStr) {
    try {
      let date = new Date()
      if (dateStr.includes('Hoje') || dateStr.includes('Amanhã')) {
        let [hour, minute] = dateStr.split('/')[1].trim().split(':').map(t => Number(t))
        date.setHours(hour, minute, 0, 0)
        if (dateStr.includes('Amanhã')) date.setDate(date.getDate() + 1)
      } else {
        let [datePart, timeStr] = dateStr.split(' ')
        let [day, month, year] = datePart.split('/').map(t => Number(t))
        let [hour, minute] = timeStr.trim().split(':').map(t => Number(t))
        date = new Date(year, --month, day, hour, minute, 0, 0)
      }
      return date
    } catch (error) {
      console.log(error)
    }
  }
}

module.exports = Bwin;
Has anyone experienced this?