Skip to main content
Glama

Financial News and Notes MCP Server

MIT License
129
199
  • Apple
  • Linux
baiduNews.js6.35 kB
// 百度新闻搜索爬虫 export async function searchBaiduNews(keywords) { try { // 将所有关键词用空格连接,支持多关键词搜索 const searchQuery = keywords.join(' '); console.log(`正在搜索百度新闻关键词: ${searchQuery}`); // 百度新闻搜索URL,使用word参数传递搜索关键词 const encodedQuery = encodeURIComponent(searchQuery); const baiduUrl = `https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&ie=utf-8&word=${encodedQuery}`; const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), 15000); const response = await fetch(baiduUrl, { method: 'GET', headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'Referer': 'https://www.baidu.com/' }, signal: controller.signal }); clearTimeout(timeoutId); if (!response.ok) { throw new Error(`百度新闻请求失败: ${response.status}`); } const html = await response.text(); console.log(`百度新闻页面HTML长度: ${html.length}`); // 解析百度新闻页面内容 const newsItems = parseBaiduNews(html, searchQuery); console.log(`百度新闻解析完成,共获得 ${newsItems.length} 条新闻`); return newsItems; } catch (error) { if (error instanceof Error && error.name === 'AbortError') { console.error('百度新闻搜索超时'); } else { console.error('百度新闻搜索出错:', error); } return []; } } // 解析百度新闻页面内容 function parseBaiduNews(html, searchQuery) { const newsItems = []; try { // 策略: 查找所有新闻结果的容器区块(div.result),然后逐个解析 const newsBlockRegex = /<div[^>]*class="[^"]*\bresult\b[^"]*"[^>]*>(.*?)<\/div>/gs; const blockMatches = html.match(newsBlockRegex); if (blockMatches) { console.log(`找到 ${blockMatches.length} 个新闻区块`); for (const blockHtml of blockMatches) { const newsItem = extractNewsFromBaiduItem(blockHtml, searchQuery); if (newsItem && newsItems.length < 15) { // 检查重复,避免添加相同的文章 if (!newsItems.some(item => item.title === newsItem.title)) { newsItems.push(newsItem); } } } } // 如果主策略未找到任何新闻,则启用备用策略 if (newsItems.length === 0) { console.log("主策略未找到新闻,启用备用策略..."); // 备用策略: 查找包含新闻标题的h3标签 (旧方法) const titleRegex = /<h3[^>]*class="[^"]*t"[^>]*><a[^>]*href="([^"]*)"[^>]*>([^<]*(?:<[^>]*>[^<]*)*)<\/a><\/h3>/g; let titleMatch; while ((titleMatch = titleRegex.exec(html)) !== null && newsItems.length < 15) { const url = titleMatch[1]; const title = titleMatch[2].replace(/<[^>]*>/g, '').trim(); if (title && url && containsKeywords(title, searchQuery)) { newsItems.push({ title: title, summary: title, url: url, source: '百度新闻', publishTime: '未知时间', // 备用策略无法保证时间匹配 keywords: searchQuery.split(' ').filter(k => k.trim().length > 0) }); } } } } catch (error) { console.error('百度新闻页面解析出错:', error); } return newsItems; } // 从单个百度新闻区块中提取完整信息 function extractNewsFromBaiduItem(itemHtml, searchQuery) { try { // 标题和链接通常在 h3 > a 标签中 const titleMatch = itemHtml.match(/<h3[^>]*><a[^>]*href="([^"]*)"[^>]*>([^<]*(?:<[^>]*>[^<]*)*)<\/a><\/h3>/); // 摘要信息 const summaryMatch = itemHtml.match(/<div[^>]*class="[^"]*c-abstract[^"]*"[^>]*>([^<]+(?:<br\s*\/?>[^<]+)*)<\/div>/); // 时间信息 const timeMatch = itemHtml.match(/<span[^>]*class="[^"]*c-color-gray2[^"]*"[^>]*aria-label="发布于:([^"]*)"[^>]*>([^<]*)<\/span>/); if (titleMatch && titleMatch[1] && titleMatch[2]) { const url = titleMatch[1]; const title = titleMatch[2].replace(/<[^>]*>/g, '').trim(); // 摘要是可选的,默认为标题 let summary = title; if (summaryMatch && summaryMatch[1]) { summary = summaryMatch[1].replace(/<br\s*\/?>/g, ' ').replace(/<[^>]*>/g, '').trim(); } // 时间也是可选的 const time = timeMatch ? timeMatch[2].trim() : ''; // 必须有标题和链接,且包含关键词 if (title && url && containsKeywords(title + summary, searchQuery)) { return { title, summary, url: url, source: '百度新闻', publishTime: time || '未知时间', keywords: searchQuery.split(' ').filter(k => k.trim().length > 0) }; } } } catch (error) { console.error('解析百度新闻项出错:', error); } return null; } // 检查内容是否包含关键词 function containsKeywords(content, searchQuery) { const keywords = searchQuery.split(' ').filter(k => k.trim().length > 0); const lowerContent = content.toLowerCase(); // OR逻辑:只要包含任意一个关键词即可 return keywords.some(keyword => lowerContent.includes(keyword.toLowerCase())); }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/guangxiangdebizi/my-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server