Web scraping and browser automation powered by Puppeteer.
✅ USE this skill when:
❌ DON'T use this skill when:
web_fetch instead

# Install Puppeteer
npm install puppeteer
# Basic scraping
node scripts/scrape.js https://example.com
const puppeteer = require('puppeteer');
/**
 * Launch a headless browser, load `url`, and close the browser when done.
 *
 * The browser is closed in a `finally` block so that a failed navigation or
 * extraction step does not leave the launched browser running.
 *
 * @param {string} url - Address of the page to load.
 * @returns {Promise<void>} Settles after the browser has been closed; rejects
 *   with the original error if loading or extraction fails.
 */
async function scrape(url) {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });
    // ... extract data ...
  } finally {
    // Runs on success and on failure alike.
    await browser.close();
  }
}
// NOTE: these snippets are fragments; they use `page` (and `url`) without
// declaring them, so both must be provided by the surrounding code.

// Get all text from a selector
const titles = await page.$$eval('h2', els => els.map(el => el.textContent.trim()));
// Get text from single element
const price = await page.$eval('.price', el => el.textContent.trim());
// Get the inner HTML of the '.product-list' element
const html = await page.$eval('.product-list', el => el.innerHTML);
// Collect the trimmed text and the href attribute of every <a> element
const links = await page.$$eval('a', els => els.map(el => ({
text: el.textContent.trim(),
href: el.getAttribute('href')
})));
// Wait for selector
await page.waitForSelector('.results', { timeout: 10000 });
// Wait for network idle
await page.goto(url, { waitUntil: 'networkidle2' });
// Wait for function
// The predicate holds once more than 10 '.item' elements exist in the document.
await page.waitForFunction(() => document.querySelectorAll('.item').length > 10);
/**
 * Collect the trimmed text of every `.item` element across numbered pages.
 *
 * Pages are requested as `page=1`, `page=2`, ... up to `maxPages`. Crawling
 * stops early at the first page that yields no `.item` elements.
 *
 * @param {string} baseUrl - Listing URL; may already contain a query string.
 * @param {number} [maxPages=5] - Upper bound on the number of pages to visit.
 * @returns {Promise<string[]>} Item texts in visiting order.
 */
async function scrapeWithPagination(baseUrl, maxPages = 5) {
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const page = await browser.newPage();
    const results = [];
    // Append with '&' when baseUrl already carries a query string, otherwise
    // a second '?' would produce a malformed URL.
    const separator = baseUrl.includes('?') ? '&' : '?';
    for (let i = 1; i <= maxPages; i++) {
      const url = `${baseUrl}${separator}page=${i}`;
      await page.goto(url, { waitUntil: 'networkidle2' });
      const items = await page.$$eval('.item', els =>
        els.map(el => el.textContent.trim())
      );
      // An empty page is treated as the end of the listing.
      if (items.length === 0) break;
      results.push(...items);
    }
    return results;
  } finally {
    // Close the browser even when navigation or extraction throws.
    await browser.close();
  }
}
// Full page screenshot
await page.screenshot({ path: 'screenshot.png', fullPage: true });
// Element screenshot
const element = await page.$('.chart');
// page.$ yields null when nothing matches; fail with a clear message instead
// of a TypeError on the screenshot call below.
if (!element) {
  throw new Error('No element matches selector ".chart"');
}
await element.screenshot({ path: 'chart.png' });
// Block images, stylesheets, and fonts; let every other request through.
// Interception must be enabled before the 'request' handler can abort or
// continue requests.
await page.setRequestInterception(true);
page.on('request', req => {
// Abort requests whose resource type is in the block list.
if (['image', 'stylesheet', 'font'].includes(req.resourceType())) {
req.abort();
} else {
req.continue();
}
});
// Usage: node scripts/scrape.js <url> [selector]
const puppeteer = require('puppeteer');
// Target URL (required) and CSS selector (optional) from the command line.
const url = process.argv[2];
// Fall back to the whole document body when no selector is given.
const selector = process.argv[3] || 'body';
if (!url) {
  console.error('Usage: node scrape.js <url> [selector]');
  process.exit(1);
}
// Load the page, print the trimmed text of every matching element as JSON.
(async () => {
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });
    const content = await page.$$eval(selector, els =>
      els.map(el => el.textContent.trim())
    );
    console.log(JSON.stringify(content, null, 2));
  } finally {
    // Close the browser even when navigation or extraction fails.
    await browser.close();
  }
})().catch(err => {
  // Report the failure and signal it through the exit status instead of
  // leaving an unhandled rejection.
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});
// Usage: node scripts/screenshot.js <url> [output.png]
const puppeteer = require('puppeteer');
// Target URL (required) and output file (optional) from the command line.
const url = process.argv[2];
const output = process.argv[3] || 'screenshot.png';
if (!url) {
  console.error('Usage: node screenshot.js <url> [output.png]');
  process.exit(1);
}
// Load the page and write a full-page screenshot to `output`.
(async () => {
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });
    await page.screenshot({ path: output, fullPage: true });
    console.log(`Screenshot saved to ${output}`);
  } finally {
    // Close the browser even when navigation or the screenshot fails.
    await browser.close();
  }
})().catch(err => {
  // Report the failure and signal it through the exit status instead of
  // leaving an unhandled rejection.
  console.error('Screenshot failed:', err);
  process.exitCode = 1;
});
// Usage: node crawl.js <url> <selector> [maxPages]
const puppeteer = require('puppeteer');
// Start URL and CSS selector are required; the page limit is optional.
const url = process.argv[2];
const selector = process.argv[3];
// Explicit radix; a missing or non-numeric argument (NaN) or 0 falls back to 10.
const maxPages = Number.parseInt(process.argv[4], 10) || 10;
if (!url || !selector) {
  console.error('Usage: node crawl.js <url> <selector> [maxPages]');
  process.exit(1);
}
// Visit page=1..maxPages, gather the trimmed text of matching elements, and
// print everything as one JSON array.
(async () => {
  const browser = await puppeteer.launch({ headless: 'new' });
  try {
    const page = await browser.newPage();
    const allData = [];
    for (let i = 1; i <= maxPages; i++) {
      // Use '&' when the URL already has a query string.
      const pageUrl = url.includes('?') ? `${url}&page=${i}` : `${url}?page=${i}`;
      // Progress goes to stderr so stdout carries only the JSON result.
      console.error(`Crawling: ${pageUrl}`);
      await page.goto(pageUrl, { waitUntil: 'networkidle2' });
      const data = await page.$$eval(selector, els =>
        els.map(el => el.textContent.trim())
      );
      // A page without matches is treated as the end of the listing.
      if (data.length === 0) break;
      allData.push(...data);
    }
    console.log(JSON.stringify(allData, null, 2));
  } finally {
    // Close the browser even when a page fails to load.
    await browser.close();
  }
})().catch(err => {
  // Report the failure and signal it through the exit status instead of
  // leaving an unhandled rejection.
  console.error('Crawl failed:', err);
  process.exitCode = 1;
});
| Target | Selector |
|---|---|
| All links | a |
| All images | img |
| Headings | h1, h2, h3 |
| Lists | ul li, ol li |
| Tables | table tr |
| Cards/Items | .item, .card, .product |
| Prices | .price, [class*="price"] |
| Descriptions | .description, .summary |
- Check the site's crawl rules first: `curl example.com/robots.txt`
- Pause between requests: `await new Promise(r => setTimeout(r, 2000))`
- Use `networkidle2` for SPAs (Single Page Apps)
- Set a user agent:

```javascript
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
```
For detailed Puppeteer API, see puppeteer/docs/api.md.
共 1 个版本