pinterest-scraper.js•17 kB
// Pinterest image scraper using puppeteer-core
import fs from 'fs';
import puppeteer from 'puppeteer-core';
// Default configuration constants
const DEFAULT_SEARCH_LIMIT = 10;
const DEFAULT_HEADLESS_MODE = true;
// 检查是否在测试环境中
const isTestEnvironment = process.env.NODE_ENV === 'test' || process.env.JEST_WORKER_ID !== undefined;
// 从环境变量获取代理设置
const PROXY_SERVER = process.env.MCP_PINTEREST_PROXY_SERVER || '';
class PinterestScraper {
constructor() {
this.baseUrl = 'https://www.pinterest.com';
this.searchUrl = `${this.baseUrl}/search/pins/?q=`;
// Default Chrome paths for different platforms
this.chromePaths = {
mac: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
macAlt: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
linux: '/usr/bin/google-chrome',
win: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
winAlt: 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe'
};
}
/**
* 获取当前操作系统的Chrome可执行文件路径
* @returns {string} Chrome可执行文件路径
*/
getChromePath() {
const platform = process.platform;
// 在测试环境中返回一个固定路径
if (isTestEnvironment) {
return '/mock/chrome/path';
}
// 检测操作系统类型
if (platform === 'darwin') {
// macOS
if (fs.existsSync(this.chromePaths.mac)) {
return this.chromePaths.mac;
} else if (fs.existsSync(this.chromePaths.macAlt)) {
return this.chromePaths.macAlt;
}
} else if (platform === 'linux') {
// Linux
if (fs.existsSync(this.chromePaths.linux)) {
return this.chromePaths.linux;
}
} else if (platform === 'win32') {
// Windows
if (fs.existsSync(this.chromePaths.win)) {
return this.chromePaths.win;
} else if (fs.existsSync(this.chromePaths.winAlt)) {
return this.chromePaths.winAlt;
}
}
// 如果找不到Chrome,抛出错误
throw new Error('无法找到Chrome浏览器,请安装Chrome或手动指定可执行文件路径');
}
/**
* Search for Pinterest images
* @param {string} keyword - Search keyword
* @param {number} limit - Result limit
* @param {boolean} headless - Whether to use headless mode
* @param {AbortSignal} signal - AbortController signal for cancelling the operation
* @returns {Promise<Array>} - Search results array
*/
async search(keyword, limit = DEFAULT_SEARCH_LIMIT, headless = DEFAULT_HEADLESS_MODE, signal) {
// Debug log for parameters
// console.error('PinterestScraper.search called with:');
// console.error('- keyword:', keyword);
// console.error('- limit:', limit);
// console.error('- headless:', headless);
// console.error('- signal:', signal ? 'provided' : 'not provided');
let browser = null;
let page = null;
try {
// Support for cancellation
if (signal && signal.aborted) {
// console.error('Search aborted before starting');
throw new Error('操作被取消');
}
// Build search URL
const searchQuery = encodeURIComponent(keyword);
const url = `${this.searchUrl}${searchQuery}`;
// console.error('Search URL:', url);
// Launch browser - using system installed Chrome
try {
// 在测试环境中使用 mock
if (isTestEnvironment) {
// console.error('Test environment detected, using mock browser');
// 在测试环境中,puppeteer-core已经被Jest模拟了,这里简单启动即可
// 不需要提供executablePath,因为模拟版本不会真正启动Chrome
browser = await puppeteer.launch();
} else {
const options = {
executablePath: this.getChromePath(),
headless: headless ? 'new' : false,
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-accelerated-2d-canvas',
'--disable-gpu',
'--lang=zh-CN,zh'
]
};
// 如果设置了代理服务器,添加到启动参数中
if (PROXY_SERVER) {
console.log(`使用代理服务器: ${PROXY_SERVER}`);
options.args.push(`--proxy-server=${PROXY_SERVER}`);
}
browser = await puppeteer.launch(options);
}
} catch (err) {
// console.error('Failed to launch browser:', err.message);
return [];
}
// Check for cancellation after browser launch
if (signal && signal.aborted) {
// console.error('Search aborted after browser launch');
await browser.close();
throw new Error('操作被取消');
}
if (!browser) {
// console.error('Browser is null, returning empty results');
return [];
}
// Create new page
try {
page = await browser.newPage();
} catch (err) {
// console.error('Failed to create page:', err.message);
await browser.close();
return [];
}
// Check for cancellation after page creation
if (signal && signal.aborted) {
// console.error('Search aborted after page creation');
await browser.close();
throw new Error('操作被取消');
}
// Set viewport size
await page.setViewport({ width: 1280, height: 800 }).catch(err => {
// console.error('Failed to set viewport:', err.message);
});
// Set user agent
await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36').catch(err => {
// console.error('Failed to set user agent:', err.message);
});
// Set timeouts
page.setDefaultNavigationTimeout(60000);
page.setDefaultTimeout(30000);
// 跟踪添加的事件监听器
const addedEventListeners = new Set();
// Simplify request interception
try {
await page.setRequestInterception(true);
// Handle request interception with cancellation support
const requestHandler = (req) => {
// Check if operation was cancelled
if (signal && signal.aborted) {
req.abort();
return;
}
const resourceType = req.resourceType();
if (resourceType === 'image' || resourceType === 'font' || resourceType === 'media') {
req.abort();
} else {
req.continue();
}
};
page.on('request', requestHandler);
addedEventListeners.add('request');
} catch (err) {
// console.error('Failed to set request interception:', err.message);
}
// Check for cancellation before navigation
if (signal && signal.aborted) {
// console.error('Search aborted before navigation');
await browser.close();
throw new Error('操作被取消');
}
// Navigate to Pinterest search page
try {
await page.goto(url, { waitUntil: 'networkidle2', timeout: 60000 });
} catch (err) {
// console.error('Page navigation failed:', err.message);
await browser.close();
return [];
}
// Check for cancellation after navigation
if (signal && signal.aborted) {
// console.error('Search aborted after navigation');
await browser.close();
throw new Error('操作被取消');
}
// Wait for images to load
try {
await page.waitForSelector('div[data-test-id="pin"]', { timeout: 10000 });
} catch (err) {
// console.log('Pin elements not found, but continuing:', err.message);
}
// Check for cancellation before scrolling
if (signal && signal.aborted) {
// console.error('Search aborted before scrolling');
await browser.close();
throw new Error('操作被取消');
}
// Scroll page to load more content
try {
// Calculate scroll distance based on limit
const scrollDistance = Math.max(limit * 300, 1000);
await this.autoScroll(page, scrollDistance, signal);
} catch (err) {
// If error is from cancellation, propagate it
if (signal && signal.aborted) {
// console.error('Scroll cancelled:', err.message);
await browser.close();
throw new Error('操作被取消');
}
// console.error('Failed to scroll page:', err.message);
}
// Check for cancellation before extracting images
if (signal && signal.aborted) {
// console.error('Search aborted before image extraction');
await browser.close();
throw new Error('操作被取消');
}
// Extract image data
let results = [];
try {
// Extract src attributes from all image elements
results = await page.evaluate(() => {
const images = Array.from(document.querySelectorAll('img'));
return images
.filter(img => img.src && img.src.includes('pinimg.com'))
.map(img => {
let imageUrl = img.src;
// Handle various thumbnail sizes, convert to original size
if (imageUrl.match(/\/\d+x\d*\//)) {
imageUrl = imageUrl.replace(/\/\d+x\d*\//, '/originals/');
}
// Replace specific thumbnail patterns
const thumbnailPatterns = ['/60x60/', '/236x/', '/474x/', '/736x/'];
for (const pattern of thumbnailPatterns) {
if (imageUrl.includes(pattern)) {
imageUrl = imageUrl.replace(pattern, '/originals/');
break;
}
}
return {
title: img.alt || 'Unknown Title',
image_url: imageUrl,
link: img.closest('a') ? img.closest('a').href : imageUrl,
source: 'pinterest'
};
});
}).catch(err => {
// console.error('Failed to extract images:', err.message);
return [];
});
} catch (err) {
// console.error('Error evaluating page:', err.message);
results = [];
}
// Final cancellation check before processing results
if (signal && signal.aborted) {
// console.error('Search aborted before processing results');
await browser.close();
throw new Error('操作被取消');
}
// Ensure results is an array
const validResults = Array.isArray(results) ? results : [];
// Deduplicate and limit results
const uniqueResults = [];
const urlSet = new Set();
for (const item of validResults) {
if (uniqueResults.length >= limit) break;
// Ensure item is valid object with image_url property
if (item && typeof item === 'object' && item.image_url && !urlSet.has(item.image_url)) {
urlSet.add(item.image_url);
uniqueResults.push({
...item,
// Ensure 'source' field is present
source: item.source || 'pinterest'
});
}
}
return uniqueResults;
} catch (error) {
// Check if error is from cancellation
if (signal && signal.aborted || error.message === '操作被取消') {
// console.error('Pinterest search cancelled:', error.message);
throw error; // Propagate cancellation error
}
// console.error('Pinterest search error:', error.message);
return [];
} finally {
// 清理所有事件监听器
if (page) {
try {
page.removeAllListeners();
} catch (e) {
// console.error('Error removing event listeners:', e.message);
}
}
// Close browser
if (browser) {
try {
await browser.close();
} catch (e) {
// console.error('Error closing browser:', e.message);
}
}
}
}
/**
* Auto-scroll page to load more content
* @param {Page} page - Puppeteer page object
* @param {number} maxScrollDistance - Maximum scroll distance
* @param {AbortSignal} signal - AbortController signal
*/
async autoScroll(page, maxScrollDistance = 3000, signal) {
// Check for cancellation before starting scroll
if (signal && signal.aborted) {
throw new Error('操作被取消');
}
await page.evaluate(async (maxScrollDistance) => {
await new Promise((resolve, reject) => {
let totalHeight = 0;
const distance = 100;
const timer = setInterval(() => {
const scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
// Stop after scrolling a certain distance
if (totalHeight >= maxScrollDistance) {
clearInterval(timer);
resolve();
}
}, 100);
// Add cleanup function to handle potential cancellation
window.scrollCancelled = () => {
clearInterval(timer);
reject(new Error('操作被取消'));
};
});
}, maxScrollDistance);
// Check for cancellation during scrolling
if (signal && signal.aborted) {
await page.evaluate(() => {
if (window.scrollCancelled) window.scrollCancelled();
});
throw new Error('操作被取消');
}
// Wait for new content to load
const waitTime = signal && signal.aborted ? 0 : 2000;
if (waitTime > 0) {
await new Promise(resolve => setTimeout(resolve, waitTime));
}
// Final cancellation check
if (signal && signal.aborted) {
throw new Error('操作被取消');
}
}
/**
* Download image
* @param {string} imageUrl - Image URL
* @param {string} outputPath - Output path
* @param {AbortSignal} signal - AbortController signal
* @returns {Promise<boolean>} - Success flag
*/
async downloadImage(imageUrl, outputPath, signal) {
try {
// 检查参数
if (!imageUrl || !outputPath) {
// console.error('Image URL or output path is empty');
return false;
}
// 检查取消信号
if (signal && signal.aborted) {
// console.error('Download cancelled before starting');
return false;
}
// console.log('Downloading image:', imageUrl);
// 将缩略图URL转换为原图
imageUrl = this.transformImageUrl(imageUrl);
// 使用fetch API下载图片,支持取消功能
const fetchOptions = signal ? { signal } : undefined;
const response = await fetch(imageUrl, fetchOptions);
if (!response.ok) {
throw new Error(`Download failed, status code: ${response.status}`);
}
// 检查取消信号
if (signal && signal.aborted) {
// console.error('Download cancelled after fetch response');
return false;
}
const arrayBuffer = await response.arrayBuffer();
const buffer = Buffer.from(arrayBuffer);
// 检查取消信号
if (signal && signal.aborted) {
// console.error('Download cancelled after buffer download');
return false;
}
// 保存图片 - 确保在测试环境中模拟函数被正确调用
fs.writeFileSync(outputPath, buffer);
// console.log('Image saved to:', outputPath);
return true;
} catch (error) {
// 检查是否为取消操作导致的错误
if (signal && signal.aborted || error.name === 'AbortError') {
// console.error('Download operation was cancelled');
return false;
}
// console.error(`Failed to download image: ${error.message}`);
return false;
}
}
/**
* Transform thumbnail URL to original size
* @param {string} url - Thumbnail URL
* @returns {string} - Original size URL
*/
transformImageUrl(url) {
if (!url) return url;
// Handle various thumbnail sizes, convert to original size
if (url.match(/\/\d+x\d*\//)) {
return url.replace(/\/\d+x\d*\//, '/originals/');
}
// Replace specific thumbnail patterns
const thumbnailPatterns = ['/60x60/', '/236x/', '/474x/', '/736x/'];
for (const pattern of thumbnailPatterns) {
if (url.includes(pattern)) {
return url.replace(pattern, '/originals/');
}
}
return url;
}
}
// Export PinterestScraper class
export { PinterestScraper as default };