/**
* Web Tools — web_fetch and web_search implementations
*
* Extracted from local-tools.ts for single-responsibility.
*/
import { lookup } from "node:dns/promises";
import { getValidToken, createAuthenticatedClient } from "../auth-service.js";
import { resolveConfig } from "../config-store.js";
import { ToolResult } from "../../../shared/types.js";
// ============================================================================
// SSRF PROTECTION
// ============================================================================
/**
 * Static SSRF pre-check: decides from the URL text alone (no DNS) whether a
 * fetch must be refused.
 *
 * Blocks anything that is not http(s), unparseable input, loopback /
 * private / link-local / unspecified IP literals (IPv4 and IPv6),
 * IPv4-mapped IPv6, legacy numeric IP encodings, and internal-only names.
 *
 * Range checks are applied only to real IP literals, so ordinary DNS names
 * that merely begin with similar characters ("fda.gov", "10.example.com")
 * are not rejected here; names still go through DNS-based checking in
 * resolveAndCheckUrl.
 *
 * @param urlStr Absolute URL to inspect.
 * @returns true when the URL must NOT be fetched.
 */
export function isBlockedUrl(urlStr: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(urlStr);
  } catch {
    // Unparseable input is never fetched.
    return true;
  }

  // Only plain HTTP(S) is allowed (rejects file:, ftp:, data:, ...).
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") return true;

  // Lower-case and drop a trailing root dot so "localhost." cannot dodge the name checks.
  const host = parsed.hostname.toLowerCase().replace(/\.$/, "");
  if (host === "") return true;

  // IPv6 literal — the URL parser keeps the surrounding brackets in `hostname`.
  if (host.startsWith("[") || host.includes(":")) {
    const v6 = host.replace(/^\[|\]$/g, "");
    if (v6 === "::1" || v6 === "::") return true; // loopback / unspecified
    if (/^fe[89ab][0-9a-f]:/.test(v6)) return true; // fe80::/10 link-local
    if (/^f[cd]/.test(v6)) return true; // fc00::/7 unique-local
    if (v6.includes("::ffff:")) return true; // IPv4-mapped (e.g. ::ffff:7f00:1)
    return false;
  }

  // IPv4 dotted quad (the URL parser normalizes decimal/hex/octal/short forms to this).
  const quad = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
  if (quad) {
    const first = Number(quad[1]);
    const second = Number(quad[2]);
    if (first === 0) return true; // 0.0.0.0/8 "this host"
    if (first === 10) return true; // 10.0.0.0/8
    if (first === 127) return true; // 127.0.0.0/8 loopback (not just 127.0.0.1)
    if (first === 169 && second === 254) return true; // link-local / cloud metadata
    if (first === 172 && second >= 16 && second <= 31) return true; // 172.16.0.0/12
    if (first === 192 && second === 168) return true; // 192.168.0.0/16
  }

  // Defense in depth: numeric encodings, in case a runtime did not normalize them.
  if (/^\d+$/.test(host) || /^0x[0-9a-f]+$/i.test(host)) return true; // 2130706433, 0x7f000001
  if (/^0\d+\./.test(host)) return true; // octal, e.g. 0177.0.0.1
  if (/^0(\.0)*$/.test(host)) return true; // 0, 0.0, 0.0.0

  // Internal-only names.
  if (host === "localhost" || host.endsWith(".localhost")) return true;
  if (host.endsWith(".internal") || host.endsWith(".local")) return true;

  return false;
}
/**
 * Full SSRF check for a URL: static screening via isBlockedUrl, then a check
 * of the address(es) the host actually maps to.
 *
 * - IP-literal hosts are range-checked directly instead of being skipped, so
 *   this function does not rely solely on the static check for literals.
 * - DNS names are resolved with `all: true` and EVERY returned record is
 *   checked; a host with one public and one private record is blocked.
 *
 * NOTE(review): this cannot stop DNS rebinding — fetch() performs its own
 * resolution afterwards and may receive a different answer.
 *
 * @param url Absolute URL about to be fetched.
 * @returns true when the URL must NOT be fetched.
 */
async function resolveAndCheckUrl(url: string): Promise<boolean> {
  if (isBlockedUrl(url)) return true;

  // True when a textual IPv4/IPv6 address is loopback, private, link-local or unspecified.
  const isInternalAddress = (address: string): boolean => {
    if (address.includes(":")) {
      const lower = address.toLowerCase();
      return (
        lower === "::1" ||
        lower === "::" ||
        lower.startsWith("fe80") ||
        lower.startsWith("fc") ||
        lower.startsWith("fd") ||
        lower.startsWith("::ffff:")
      );
    }
    const [first = -1, second = -1] = address.split(".").map(Number);
    if (first === 0) return true; // 0.0.0.0/8
    if (first === 10) return true; // 10.0.0.0/8
    if (first === 127) return true; // 127.0.0.0/8
    if (first === 169 && second === 254) return true; // link-local
    if (first === 172 && second >= 16 && second <= 31) return true; // 172.16.0.0/12
    if (first === 192 && second === 168) return true; // 192.168.0.0/16
    return false;
  };

  try {
    // URL keeps brackets around IPv6 literals; strip them before inspecting.
    const hostname = new URL(url).hostname.replace(/^\[|\]$/g, "");

    // IP literal: nothing to resolve, check the address itself.
    if (/^[\d.]+$/.test(hostname) || hostname.includes(":")) {
      return isInternalAddress(hostname);
    }

    const records = await lookup(hostname, { all: true });
    return records.some((record) => isInternalAddress(record.address));
  } catch {
    return false; // DNS resolution failed, allow (will fail at fetch anyway)
  }
}
// ============================================================================
// WEB FETCH
// ============================================================================
/**
 * Fetch a URL and return its content as readable text.
 *
 * - Refuses URLs that fail the SSRF check (resolveAndCheckUrl).
 * - Follows redirects manually, re-running the SSRF check on every hop, up to
 *   a fixed limit (previously a single hop, which broke common
 *   http -> https -> www chains).
 * - Each request has a 30s timeout; for the final response the timeout also
 *   covers reading the body, so a stalled body cannot hang the tool.
 * - JSON is pretty-printed, text/plain is returned as-is, everything else is
 *   treated as HTML and converted with htmlToText.
 *
 * @param input Tool input; `input.url` must be a non-empty string.
 * @returns ToolResult whose `output` is the content or an error description.
 */
export async function webFetch(input: Record<string, unknown>): Promise<ToolResult> {
  const maxRedirects = 5;
  const timeoutMs = 30000;
  const requestHeaders = {
    "User-Agent": "WhaleCode/3.0 (CLI Agent)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  };

  const url = input.url;
  if (typeof url !== "string" || !url) return { success: false, output: "url is required" };
  if (await resolveAndCheckUrl(url)) {
    return { success: false, output: "URL blocked: cannot fetch localhost, private IPs, or internal addresses" };
  }

  let timer: ReturnType<typeof setTimeout> | undefined;
  try {
    // Issues one request with its own abort controller and (re-)armed timeout.
    const request = (target: string): Promise<Response> => {
      const controller = new AbortController();
      clearTimeout(timer);
      timer = setTimeout(() => controller.abort(), timeoutMs);
      return fetch(target, {
        headers: requestHeaders,
        signal: controller.signal,
        redirect: "manual",
      });
    };

    let currentUrl = url;
    let response = await request(currentUrl);

    // Handle redirects manually to prevent SSRF via redirect to internal addresses
    for (let hops = 0; response.status >= 300 && response.status < 400; hops++) {
      // Headers are in; stop the clock while the next hop is validated.
      clearTimeout(timer);
      // Release the connection held by the unread redirect body.
      void response.body?.cancel().catch(() => undefined);

      if (hops >= maxRedirects) {
        return { success: false, output: "Too many redirects" };
      }
      const location = response.headers.get("location");
      if (!location) {
        return { success: false, output: `Redirect ${response.status} with no Location header` };
      }
      // Resolve relative redirects against the URL that issued them
      const nextUrl = new URL(location, currentUrl).toString();
      if (await resolveAndCheckUrl(nextUrl)) {
        return { success: false, output: "Redirect target is a blocked address" };
      }
      currentUrl = nextUrl;
      response = await request(currentUrl);
    }

    if (!response.ok) {
      return { success: false, output: `HTTP ${response.status}: ${response.statusText}` };
    }

    const contentType = response.headers.get("content-type") || "";
    // The last request's timer is still armed here, bounding the body download.
    const body = await response.text();

    // JSON — return pretty-printed, or raw if it does not parse
    if (contentType.includes("application/json")) {
      try {
        const pretty = JSON.stringify(JSON.parse(body), null, 2);
        return { success: true, output: pretty.slice(0, 80000) };
      } catch {
        return { success: true, output: body.slice(0, 80000) };
      }
    }

    // Plain text — return as-is
    if (contentType.includes("text/plain")) {
      return { success: true, output: body.slice(0, 80000) };
    }

    // HTML — convert to readable text
    const text = htmlToText(body);
    const truncated = text.length > 50000 ? text.slice(0, 50000) + "\n\n... (truncated)" : text;
    return { success: true, output: `# ${url}\n\n${truncated}` };
  } catch (err: unknown) {
    const message = err instanceof Error && err.message ? err.message : String(err);
    return { success: false, output: `Fetch error: ${message}` };
  } finally {
    // Always release the pending timer, including on thrown/aborted requests.
    clearTimeout(timer);
  }
}
// ============================================================================
// HTML TO TEXT
// ============================================================================
/** Remove all instances of a tag and its content, handling nesting (innermost first) */
/**
 * Remove every `<tag>…</tag>` element (including its content) from `html`.
 *
 * Nesting is handled by repeatedly deleting the innermost pairs — the pair
 * pattern refuses to span another opening `<tag` — until nothing changes
 * (capped at 50 passes). Any opening tags left without a closing partner are
 * then dropped.
 *
 * The tag name must be followed by whitespace, `>` or `/`, so removing `nav`
 * does not touch `<navigation>`.
 *
 * @param html Markup to clean.
 * @param tag  Element name, matched case-insensitively.
 * @returns `html` without the matched elements.
 */
function removeNestedTag(html: string, tag: string): string {
  // Escape regex metacharacters so the name is always matched literally.
  const name = tag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const opening = `<${name}(?=[\\s>/])[^>]*>`;
  const innermostPair = new RegExp(
    `${opening}(?:(?!<${name}[\\s>/])[\\s\\S])*?<\\/${name}\\s*>`,
    "gi"
  );

  let result = html;
  let previous = "";
  // Each pass peels one nesting level; the cap bounds pathological input.
  for (let pass = 0; result !== previous && pass < 50; pass++) {
    previous = result;
    result = result.replace(innermostPair, "");
  }

  // Drop unmatched opening tags (unclosed or self-closing).
  return result.replace(new RegExp(opening, "gi"), "");
}
/** Drop every `<…>` tag from `html`, keeping only the text between tags. */
function stripTags(html: string): string {
  const tagPattern = /<[^>]+>/g;
  // Splitting on tags and re-joining leaves just the text fragments.
  const textFragments = html.split(tagPattern);
  return textFragments.join("");
}
/**
 * Decode the HTML character references that commonly appear in page text.
 *
 * Handles the named references nbsp, amp, lt, gt, quot and apos, plus decimal
 * and hexadecimal numeric references. Decoding is done in a single pass, so
 * an escaped reference such as "&" + "amp;lt;" yields the literal text
 * "&" + "lt;" rather than being decoded twice. Numeric references are decoded
 * as full Unicode code points (astral characters such as emoji included).
 * Unknown names and out-of-range numbers are left untouched.
 *
 * @param text Text possibly containing character references.
 * @returns The text with recognised references replaced by their characters.
 */
function decodeEntities(text: string): string {
  // A Map (not an object literal) so names like "constructor" never hit the prototype.
  const named = new Map<string, string>([
    ["nbsp", " "],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
  ]);

  return text.replace(
    /&(#[xX][0-9a-fA-F]+|#\d+|[a-zA-Z][a-zA-Z0-9]*);/g,
    (match: string, ref: string): string => {
      if (ref.charAt(0) !== "#") return named.get(ref) ?? match;

      const marker = ref.charAt(1);
      const isHex = marker === "x" || marker === "X";
      const codePoint = isHex ? parseInt(ref.slice(2), 16) : parseInt(ref.slice(1), 10);
      // String.fromCodePoint throws outside the Unicode range; keep such input verbatim.
      if (!Number.isFinite(codePoint) || codePoint > 0x10ffff) return match;
      return String.fromCodePoint(codePoint);
    }
  );
}
/** Enhanced HTML -> readable text/markdown converter */
/**
 * Convert an HTML document into readable, markdown-flavoured plain text.
 *
 * Pipeline: pick the main content area, delete non-content elements and
 * comments, turn semantic elements (code, headings, links, tables, lists,
 * emphasis, images, rules) into markdown, convert block boundaries into
 * newlines, strip the remaining tags, decode entities, normalise whitespace.
 *
 * Every tag pattern requires the tag name to be followed by whitespace, `>`
 * or `/`. Without that, `<b` also matched `<br>`/`<blockquote>` and `<i`
 * matched `<img>`, which swallowed line breaks and image alt text into bogus
 * bold/italic spans; `<li` likewise matched `<link>`.
 *
 * @param html Raw HTML markup.
 * @returns Trimmed text with at most one blank line between paragraphs.
 */
export function htmlToText(html: string): string {
  let c = html;

  // 1. Extract main content area (skips nav, sidebar, footer automatically)
  const mainMatch =
    c.match(/<main(?=[\s>\/])[^>]*>([\s\S]*)<\/main>/i) ||
    c.match(/<article(?=[\s>\/])[^>]*>([\s\S]*)<\/article>/i);
  if (mainMatch) {
    c = mainMatch[1];
  } else {
    const bodyMatch = c.match(/<body(?=[\s>\/])[^>]*>([\s\S]*)<\/body>/i);
    if (bodyMatch) c = bodyMatch[1];
  }

  // 2. Remove non-content elements (nesting-aware)
  for (const tag of [
    "script", "style", "nav", "footer", "aside", "header",
    "form", "svg", "iframe", "select", "button", "noscript",
  ]) {
    c = removeNestedTag(c, tag);
  }

  // 3. Remove HTML comments
  c = c.replace(/<!--[\s\S]*?-->/g, "");

  // 4. Convert semantic elements -> markdown
  // Code blocks first (preserve contents)
  c = c.replace(/<pre(?=[\s>\/])[^>]*>([\s\S]*?)<\/pre>/gi, (_, inner) =>
    "\n```\n" + stripTags(inner).trim() + "\n```\n"
  );
  c = c.replace(/<code(?=[\s>\/])[^>]*>([\s\S]*?)<\/code>/gi, "`$1`");

  // Headings
  c = c.replace(/<h([1-6])(?=[\s>\/])[^>]*>([\s\S]*?)<\/h\1>/gi, (_, level, text) =>
    "\n" + "#".repeat(parseInt(level)) + " " + stripTags(text).trim() + "\n\n"
  );

  // Links — skip empty, anchor-only, and javascript: hrefs.
  // `\shref` requires a real href attribute (not e.g. data-href).
  c = c.replace(/<a(?=[\s>\/])[^>]*\shref="([^"]*)"[^>]*>([\s\S]*?)<\/a>/gi, (_, href, text) => {
    const linkText = stripTags(text).trim();
    if (!linkText) return "";
    if (href.startsWith("#") || href.startsWith("javascript:")) return linkText;
    return `[${linkText}](${href})`;
  });

  // Tables -> pipe-delimited markdown
  c = c.replace(/<table(?=[\s>\/])[^>]*>([\s\S]*?)<\/table>/gi, (_, tableContent) => {
    const rows: string[] = [];
    const rowRegex = /<tr(?=[\s>\/])[^>]*>([\s\S]*?)<\/tr>/gi;
    let rowMatch;
    let isFirstRow = true;
    while ((rowMatch = rowRegex.exec(tableContent)) !== null) {
      const cells: string[] = [];
      const cellRegex = /<(?:td|th)(?=[\s>\/])[^>]*>([\s\S]*?)<\/(?:td|th)>/gi;
      let cellMatch;
      while ((cellMatch = cellRegex.exec(rowMatch[1])) !== null) {
        cells.push(stripTags(cellMatch[1]).trim());
      }
      if (cells.length > 0) {
        rows.push("| " + cells.join(" | ") + " |");
        // The first non-empty row is treated as the header row.
        if (isFirstRow) {
          rows.push("| " + cells.map(() => "---").join(" | ") + " |");
          isFirstRow = false;
        }
      }
    }
    return "\n" + rows.join("\n") + "\n";
  });

  // List items
  c = c.replace(/<li(?=[\s>\/])[^>]*>([\s\S]*?)<\/li>/gi, (_, text) =>
    "- " + stripTags(text).trim() + "\n"
  );

  // Bold, italic
  c = c.replace(/<(strong|b)(?=[\s>\/])[^>]*>([\s\S]*?)<\/\1>/gi, "**$2**");
  c = c.replace(/<(em|i)(?=[\s>\/])[^>]*>([\s\S]*?)<\/\1>/gi, "*$2*");

  // Images -> alt text
  c = c.replace(/<img(?=[\s>\/])[^>]*\salt="([^"]*)"[^>]*>/gi, "[$1]");
  c = c.replace(/<img(?=[\s>\/])[^>]*>/gi, "");

  // Horizontal rules (with or without attributes)
  c = c.replace(/<hr(?=[\s>\/])[^>]*>/gi, "\n---\n");

  // 5. Block elements -> newlines (replace tags independently, NOT as pairs)
  c = c.replace(/<\/(?:p|div|section|article|main|blockquote|dd|dt|figcaption|figure|details|summary)>/gi, "\n\n");
  c = c.replace(/<(?:p|div|section|article|main|blockquote|dd|dt|figcaption|figure|details|summary)(?=[\s>\/])[^>]*>/gi, "");
  c = c.replace(/<br(?=[\s>\/])[^>]*>/gi, "\n");
  c = c.replace(/<\/(?:li|tr|thead|tbody|tfoot|ul|ol|dl)>/gi, "\n");

  // 6. Strip all remaining tags
  c = c.replace(/<[^>]+>/g, "");

  // 7. Decode HTML entities
  c = decodeEntities(c);

  // 8. Clean whitespace
  c = c
    .replace(/[ \t]+/g, " ")
    .replace(/ *\n */g, "\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();

  return c;
}
// ============================================================================
// WEB SEARCH (Exa API)
// ============================================================================
// Cache the Exa API key within a session
let cachedExaKey: string | null = null;

/**
 * Look up the Exa API key, trying each credential tier in order and caching
 * the first key found for subsequent calls.
 *
 * Tier 1 reads `platform_secrets` with the configured Supabase URL/key;
 * Tier 2 reads it through a client authenticated with the user's token.
 * Each tier is attempted in isolation, so a throw in Tier 1 (for example a
 * failed dynamic import) no longer prevents Tier 2 from being tried.
 *
 * @returns The key, or null when no tier yields one.
 */
async function getExaApiKey(): Promise<string | null> {
  if (cachedExaKey) return cachedExaKey;

  const tiers: Array<() => Promise<string | null>> = [
    // Tier 1: Service role key (MCP server mode)
    async () => {
      const config = resolveConfig();
      if (!config.supabaseUrl || !config.supabaseKey) return null;
      const { createClient } = await import("@supabase/supabase-js");
      const client = createClient(config.supabaseUrl, config.supabaseKey, {
        auth: { persistSession: false, autoRefreshToken: false },
      });
      const { data } = await client.from("platform_secrets").select("value").eq("key", "exa_api_key").single();
      return data?.value || null;
    },
    // Tier 2: User JWT
    async () => {
      const token = await getValidToken();
      if (!token) return null;
      const client = createAuthenticatedClient(token);
      const { data } = await client.from("platform_secrets").select("value").eq("key", "exa_api_key").single();
      return data?.value || null;
    },
  ];

  for (const readKey of tiers) {
    try {
      const key = await readKey();
      if (key) {
        cachedExaKey = key;
        return key;
      }
    } catch {
      // Best effort: this tier is unavailable, fall through to the next one.
    }
  }
  return null;
}
/**
 * Run a web search through the Exa API and format the hits as a numbered list.
 *
 * @param input Tool input: `query` (required), plus optional
 *   `allowed_domains` / `blocked_domains` lists that are forwarded to Exa as
 *   `includeDomains` / `excludeDomains` when non-empty.
 * @returns ToolResult with the formatted hits, or an error description when
 *   the query or API key is missing, Exa responds with a non-OK status, or
 *   the request fails or times out (15s).
 */
export async function webSearch(input: Record<string, unknown>): Promise<ToolResult> {
  const query = input.query as string;
  if (!query) {
    return { success: false, output: "query is required" };
  }
  const includeDomains = input.allowed_domains as string[] | undefined;
  const excludeDomains = input.blocked_domains as string[] | undefined;

  const exaKey = await getExaApiKey();
  if (!exaKey) {
    return { success: false, output: "Exa API key not configured. Add 'exa_api_key' to platform_secrets table." };
  }

  // Render one Exa hit as an indented, numbered entry.
  const renderHit = (hit: any, index: number): string => {
    const lines = [`${index + 1}. **${hit.title || "Untitled"}**`, ` ${hit.url}`];
    if (hit.publishedDate) lines.push(` Published: ${hit.publishedDate}`);
    if (hit.text) lines.push(` ${hit.text.slice(0, 500)}`);
    return lines.join("\n");
  };

  try {
    // Domain filters are only attached when the caller supplied a non-empty list.
    const payload: Record<string, unknown> = {
      query,
      numResults: 10,
      type: "auto",
      contents: { text: { maxCharacters: 1200, includeHtmlTags: false } },
      ...(includeDomains?.length ? { includeDomains } : {}),
      ...(excludeDomains?.length ? { excludeDomains } : {}),
    };

    const response = await fetch("https://api.exa.ai/search", {
      method: "POST",
      headers: {
        "x-api-key": exaKey,
        "Content-Type": "application/json",
        "Accept": "application/json",
      },
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(15000),
    });

    if (!response.ok) {
      const failureText = await response.text();
      return { success: false, output: `Exa API error (${response.status}): ${failureText}` };
    }

    const data = await response.json();
    const hits: string[] = (data.results || []).map(renderHit);
    const listing = hits.join("\n\n");
    return {
      success: true,
      output: `Found ${hits.length} results for "${query}":\n\n${listing}`,
    };
  } catch (err: any) {
    const timedOut = err.name === "TimeoutError" || err.message?.includes("timeout");
    if (timedOut) {
      return { success: false, output: "Exa search timed out (15s)" };
    }
    return { success: false, output: `Web search error: ${err.message || err}` };
  }
}