/**
* Search namespace implementation
*/
import axios from 'axios';
import * as cheerio from 'cheerio';
import { MCPServer } from '../core/server.js';
import { MCPTool } from '../types/core.js';
import {
SearchEngine,
HttpMethod,
WebSearchResponse,
FetchResponse,
ExtractTarget,
ExtractRules,
ExtractResponse,
CrawlResponse
} from '../types/search.js';
import { InvalidArgError, ConfigMissingError } from '../core/errors.js';
/**
 * Implements the `search.*` MCP tools: web search (`search.web`),
 * raw HTTP fetching (`search.fetch`), structured HTML extraction
 * (`search.extract`) and breadth-first crawling (`search.crawl`).
 *
 * All tools are registered with the server's tool registry at
 * construction time.
 */
export class SearchNamespace {
  private mcpServer: MCPServer;

  constructor(mcpServer: MCPServer) {
    this.mcpServer = mcpServer;
    this.registerTools();
  }

  /** Registers the four search.* tools and their JSON schemas with the registry. */
  private registerTools(): void {
    const registry = this.mcpServer.getRegistry();

    registry.registerTool(
      'search.web',
      {
        name: 'search.web',
        description: 'Search the web using various search engines',
        inputSchema: {
          type: 'object',
          properties: {
            query: { type: 'string' },
            engine: { type: 'string', enum: ['bing', 'google', 'brave', 'ddg'] },
            num: { type: 'number', minimum: 1, maximum: 50 }
          },
          required: ['query']
        }
      },
      this.webSearch.bind(this)
    );

    registry.registerTool(
      'search.fetch',
      {
        name: 'search.fetch',
        description: 'Fetch a URL with HTTP request options',
        inputSchema: {
          type: 'object',
          properties: {
            url: { type: 'string' },
            method: { type: 'string', enum: ['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'HEAD', 'OPTIONS'] },
            headers: { type: 'object' },
            body: { type: 'string' },
            follow: { type: 'boolean' }
          },
          required: ['url']
        }
      },
      this.fetch.bind(this)
    );

    registry.registerTool(
      'search.extract',
      {
        name: 'search.extract',
        description: 'Extract structured data from HTML content',
        inputSchema: {
          type: 'object',
          properties: {
            target: {
              oneOf: [
                {
                  type: 'object',
                  properties: { url: { type: 'string' } },
                  required: ['url']
                },
                {
                  type: 'object',
                  properties: { body_text: { type: 'string' } },
                  required: ['body_text']
                }
              ]
            },
            rules: {
              type: 'object',
              properties: {
                css: { type: 'array', items: { type: 'string' } },
                xpath: { type: 'array', items: { type: 'string' } },
                boilerplate: { type: 'boolean' }
              }
            }
          },
          required: ['target']
        }
      },
      this.extract.bind(this)
    );

    registry.registerTool(
      'search.crawl',
      {
        name: 'search.crawl',
        description: 'Crawl multiple web pages starting from seed URLs',
        inputSchema: {
          type: 'object',
          properties: {
            seed_urls: { type: 'array', items: { type: 'string' } },
            limit: { type: 'number', minimum: 1, maximum: 1000 },
            same_origin: { type: 'boolean' },
            include: { type: 'array', items: { type: 'string' } },
            exclude: { type: 'array', items: { type: 'string' } },
            cursor: { type: 'string' }
          },
          required: ['seed_urls']
        }
      },
      this.crawl.bind(this)
    );
  }

  /**
   * Dispatches a web search to the requested engine.
   *
   * @param params.query  Search query string.
   * @param params.engine Search engine; defaults to 'ddg' (the only one
   *                      usable without an API key).
   * @param params.num    Maximum number of results; defaults to 10.
   * @throws InvalidArgError for an unrecognized engine;
   *         ConfigMissingError for engines that need an unset API key.
   */
  private async webSearch(params: {
    query: string;
    engine?: SearchEngine;
    num?: number;
  }): Promise<WebSearchResponse> {
    const { query, engine = 'ddg', num = 10 } = params;
    switch (engine) {
      case 'ddg':
        return await this.searchDuckDuckGo(query, num);
      case 'brave':
        return await this.searchBrave(query, num);
      case 'bing':
        return await this.searchBing(query, num);
      case 'google':
        return await this.searchGoogle(query, num);
      default:
        throw new InvalidArgError('engine', `Unsupported search engine: ${engine}`);
    }
  }

  /**
   * Scrapes the DuckDuckGo HTML endpoint (no API key required) and parses
   * up to `num` results out of the returned markup.
   */
  private async searchDuckDuckGo(query: string, num: number): Promise<WebSearchResponse> {
    try {
      // DuckDuckGo HTML search (free tier)
      const response = await axios.get('https://html.duckduckgo.com/html/', {
        params: {
          q: query,
          t: 'h_',
          ia: 'web'
        },
        headers: {
          // Browser-like UA: the HTML endpoint serves different markup to bots.
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        },
        timeout: 10000
      });

      const $ = cheerio.load(response.data);
      const results: any[] = [];

      $('.result').each((i, element) => {
        if (i >= num) return false; // returning false stops cheerio's .each()
        const $el = $(element);
        const titleEl = $el.find('.result__title a');
        const title = titleEl.text().trim();
        const url = titleEl.attr('href');
        const snippet = $el.find('.result__snippet').text().trim();
        if (title && url) {
          results.push({
            rank: i + 1,
            title,
            // DDG sometimes emits site-relative redirect links; make them absolute.
            url: url.startsWith('/') ? `https://duckduckgo.com${url}` : url,
            snippet: snippet || undefined
          });
        }
      });

      return {
        results,
        raw: { source: 'duckduckgo', total_results: results.length }
      };
    } catch (error) {
      throw new Error(`DuckDuckGo search failed: ${error instanceof Error ? error.message : error}`);
    }
  }

  /** Stub: Brave Search requires an API key that is not configured. */
  private async searchBrave(query: string, num: number): Promise<WebSearchResponse> {
    // Brave Search API would require an API key
    throw new ConfigMissingError('BRAVE_SEARCH_API_KEY');
  }

  /** Stub: Bing search requires an Azure Cognitive Services key. */
  private async searchBing(query: string, num: number): Promise<WebSearchResponse> {
    // Bing Search API would require Azure Cognitive Services key
    throw new ConfigMissingError('BING_SEARCH_API_KEY');
  }

  /** Stub: Google Custom Search requires an API key and Search Engine ID. */
  private async searchGoogle(query: string, num: number): Promise<WebSearchResponse> {
    // Google Custom Search API would require API key and Search Engine ID
    throw new ConfigMissingError('GOOGLE_SEARCH_API_KEY');
  }

  /**
   * Performs an HTTP request and returns a normalized response.
   *
   * Never throws on HTTP error statuses (4xx/5xx are reported in `status`);
   * only network/transport failures raise. Text bodies up to 10 MB are
   * decoded as UTF-8 and, when the content type is JSON, parsed; anything
   * else is flagged `binary` and only its size is reported.
   *
   * @param params.follow Follow up to 5 redirects when true (default).
   */
  private async fetch(params: {
    url: string;
    method?: HttpMethod;
    headers?: Record<string, string>;
    body?: string;
    follow?: boolean;
  }): Promise<FetchResponse> {
    const { url, method = 'GET', headers = {}, body, follow = true } = params;

    try {
      const response = await axios({
        url,
        method: method.toLowerCase() as any,
        headers: {
          'User-Agent': 'mcp-fullstack/1.0',
          ...headers // caller headers win, including a custom User-Agent
        },
        data: body,
        maxRedirects: follow ? 5 : 0,
        timeout: 30000,
        validateStatus: () => true, // Don't throw on HTTP errors
        responseType: 'arraybuffer' // raw bytes so we can decide text vs binary ourselves
      });

      const isText = this.isTextContent(response.headers['content-type'] || '');
      const size = response.data.byteLength;

      let body_text: string | undefined;
      let json: any;
      let binary = false;

      if (isText && size < 10 * 1024 * 1024) { // Max 10MB for text content
        body_text = Buffer.from(response.data).toString('utf-8');
        // Try to parse as JSON
        if (response.headers['content-type']?.includes('application/json')) {
          try {
            json = JSON.parse(body_text);
          } catch {
            // Not valid JSON, keep as text
          }
        }
      } else {
        binary = true;
      }

      return {
        status: response.status,
        // NOTE(review): axios header values may be arrays/non-strings; this
        // cast assumes simple string headers — confirm FetchResponse's contract.
        headers: response.headers as Record<string, string>,
        body_text,
        json,
        binary,
        size
      };
    } catch (error) {
      if (axios.isAxiosError(error)) {
        throw new Error(`HTTP request failed: ${error.message}`);
      }
      throw error;
    }
  }

  /** True for MIME types we decode as text (text/*, JSON, XML, JS, form-encoded). */
  private isTextContent(contentType: string): boolean {
    return contentType.startsWith('text/') ||
           contentType.includes('application/json') ||
           contentType.includes('application/xml') ||
           contentType.includes('application/javascript') ||
           contentType.includes('application/x-www-form-urlencoded');
  }

  /**
   * Extracts structured data (CSS-selector fields, plain text, links) from
   * HTML given either a URL to fetch or a raw HTML string.
   *
   * Note: `rules.xpath` is declared in the schema but not implemented here.
   *
   * @throws Error when the target URL yields no decodable text body.
   */
  private async extract(params: {
    target: ExtractTarget;
    rules?: ExtractRules;
  }): Promise<ExtractResponse> {
    const { target, rules = {} } = params;

    let html: string;
    if ('url' in target) {
      const fetchResponse = await this.fetch({ url: target.url! });
      if (!fetchResponse.body_text) {
        throw new Error('Failed to fetch text content from URL');
      }
      html = fetchResponse.body_text;
    } else {
      html = target.body_text!;
    }

    const $ = cheerio.load(html);
    const result: ExtractResponse = {};

    // Apply boilerplate removal if requested
    if (rules.boilerplate) {
      // Remove common boilerplate elements
      $('script, style, nav, header, footer, aside, .advertisement, .ads').remove();
    }

    // Extract using CSS selectors; a single match yields a string, multiple
    // matches yield an array of strings, keyed by the selector's index.
    if (rules.css) {
      const fields: Record<string, any> = {};
      for (const [index, selector] of rules.css.entries()) {
        const elements = $(selector);
        if (elements.length === 1) {
          fields[`css_${index}`] = elements.text().trim();
        } else if (elements.length > 1) {
          fields[`css_${index}`] = elements.map((i, el) => $(el).text().trim()).get();
        }
      }
      if (Object.keys(fields).length > 0) {
        result.fields = fields;
      }
    }

    // Extract plain text content (whitespace collapsed)
    const textContent = $('body').text().replace(/\s+/g, ' ').trim();
    if (textContent) {
      result.text = textContent;
    }

    // Extract all links
    const links: Array<{ url: string; title?: string }> = [];
    $('a[href]').each((i, el) => {
      const $el = $(el);
      const href = $el.attr('href');
      const title = $el.text().trim() || $el.attr('title');
      if (href) {
        let url = href;
        // Convert relative URLs to absolute
        if ('url' in target && !href.startsWith('http')) {
          try {
            // FIX: resolve against the full page URL, not just its origin —
            // using the origin dropped the page path, so "page2.html" on
            // https://x.com/docs/ wrongly became https://x.com/page2.html.
            url = new URL(href, target.url!).toString();
          } catch {
            // Skip invalid URLs
            return;
          }
        }
        links.push({ url, title });
      }
    });
    if (links.length > 0) {
      result.links = links;
    }

    return result;
  }

  /**
   * Breadth-first crawl starting from `seed_urls`, collecting up to `limit`
   * pages. Each page record carries its status, <title>, and a 200-char
   * text excerpt; fetch failures become records with status 0.
   *
   * @param params.same_origin Restrict discovered links to the seed origins (default true).
   * @param params.include/exclude Substring filters applied to every URL.
   * @returns Pages plus a base64 `next_cursor` of remaining frontier URLs
   *          when the limit was hit before the frontier emptied.
   */
  private async crawl(params: {
    seed_urls: string[];
    limit?: number;
    same_origin?: boolean;
    include?: string[];
    exclude?: string[];
    cursor?: string;
  }): Promise<CrawlResponse> {
    const {
      seed_urls,
      limit = 50,
      same_origin = true,
      include = [],
      exclude = []
    } = params;
    // NOTE(review): `cursor` is accepted by the schema but not yet used to
    // resume a previous crawl — confirm whether resumption is intended.

    const visited = new Set<string>();
    const queue = [...seed_urls];
    // Companion set mirroring `queue` so dedupe is O(1) instead of the
    // O(n) `queue.includes()` scan per discovered link.
    const queued = new Set<string>(seed_urls);
    const pages: CrawlResponse['pages'] = [];

    // FIX: guard URL parsing of seeds — one malformed seed URL previously
    // threw synchronously and aborted the entire crawl.
    let origins: Set<string> | null = null;
    if (same_origin) {
      origins = new Set<string>();
      for (const seed of seed_urls) {
        try {
          origins.add(new URL(seed).origin);
        } catch {
          // Malformed seed; it will surface as a status-0 page when fetched.
        }
      }
    }

    while (queue.length > 0 && pages.length < limit) {
      const url = queue.shift()!;
      if (visited.has(url)) continue;
      visited.add(url);

      // Check include/exclude patterns
      if (include.length > 0 && !include.some(pattern => url.includes(pattern))) {
        continue;
      }
      if (exclude.length > 0 && exclude.some(pattern => url.includes(pattern))) {
        continue;
      }

      try {
        const fetchResponse = await this.fetch({ url });
        const $ = cheerio.load(fetchResponse.body_text || '');
        const title = $('title').text().trim();
        const textContent = $('body').text().replace(/\s+/g, ' ').trim();
        const excerpt = textContent.substring(0, 200) + (textContent.length > 200 ? '...' : '');

        pages.push({
          url,
          status: fetchResponse.status,
          title: title || undefined,
          text_excerpt: excerpt || undefined
        });

        // Add new URLs to queue if we haven't hit the limit
        if (pages.length < limit) {
          $('a[href]').each((i, el) => {
            const href = $(el).attr('href');
            if (!href) return;

            let newUrl: string;
            try {
              newUrl = href.startsWith('http') ? href : new URL(href, url).toString();
            } catch {
              return; // Skip invalid URLs
            }

            // Check same origin constraint.
            // FIX: guarded — hrefs like "httpfoo" pass the startsWith('http')
            // test above without being valid URLs, and previously made
            // `new URL(newUrl)` throw out of the link loop.
            if (origins) {
              try {
                if (!origins.has(new URL(newUrl).origin)) return;
              } catch {
                return; // unparseable absolute href
              }
            }

            if (!visited.has(newUrl) && !queued.has(newUrl)) {
              queued.add(newUrl);
              queue.push(newUrl);
            }
          });
        }
      } catch (error) {
        console.warn(`Failed to crawl ${url}:`, error);
        pages.push({
          url,
          status: 0,
          text_excerpt: `Error: ${error instanceof Error ? error.message : error}`
        });
      }
    }

    return {
      pages,
      next_cursor: queue.length > 0 ? Buffer.from(JSON.stringify(queue.slice(0, 10))).toString('base64') : undefined
    };
  }
}