Skip to main content
Glama

OneSearch MCP Server

utils.ts (5.37 kB)
/**
 * The following code is based on
 * https://github.com/bytedance/UI-TARS-desktop/tree/main/packages/agent-infra/search/browser-search
 *
 * Copyright (c) 2025 Bytedance, Inc. and its affiliates.
 * SPDX-License-Identifier: Apache-2.0
 */
import Turndown from 'turndown';
import { gfm } from 'turndown-plugin-gfm';
import { defaultLogger as logger } from '@agent-infra/logger';
import { Page } from '../browser/index.js';
import UserAgent from 'user-agents';

/**
 * Safely parses a URL string into a URL object
 * @param url - The URL string to parse
 * @returns URL object or null if invalid
 */
const parseUrl = (url: string): URL | null => {
  try {
    return new URL(url);
  } catch {
    return null;
  }
};

/**
 * Base domains whose pages are never fetched. A leading `www.` on the
 * hostname is stripped before lookup, so only the bare domain is listed.
 */
const SKIPPED_DOMAINS: ReadonlySet<string> = new Set([
  'reddit.com',
  'x.com',
  'twitter.com',
  'youtube.com',
]);

/**
 * Determines if a domain should be skipped based on a blocklist
 *
 * Unparseable URLs are also reported as skippable.
 *
 * @param url - The URL to check
 * @returns True if the domain should be skipped, false otherwise
 */
export const shouldSkipDomain = (url: string): boolean => {
  const parsed = parseUrl(url);
  if (!parsed) return true;

  // Normalise `www.example.com` to `example.com` so every blocked domain is
  // matched with and without the prefix (the previous hand-written list
  // missed `www.x.com`).
  const hostname = parsed.hostname.replace(/^www\./, '');
  return SKIPPED_DOMAINS.has(hostname);
};

/**
 * Applies various stealth techniques to make the browser appear more like a regular user browser
 *
 * Sets a random desktop user agent, enables CSP bypass and patches several
 * `navigator` properties inside the page.
 *
 * NOTE(review): `page.evaluate` patches the document that is loaded at call
 * time; whether the overrides survive a later navigation depends on the
 * `Page` implementation - confirm against the callers.
 *
 * @param page - Puppeteer page object
 */
export async function applyStealthScripts(page: Page): Promise<void> {
  const userAgent = new UserAgent({
    deviceCategory: 'desktop',
  }).toString();

  await page.setBypassCSP(true);
  await page.setUserAgent(userAgent);

  /**
   * https://intoli.com/blog/not-possible-to-block-chrome-headless/chrome-headless-test.html
   */
  await page.evaluate(() => {
    /**
     * Override the navigator.webdriver property
     * The webdriver read-only property of the navigator interface indicates whether the user agent is controlled by automation.
     * @see https://developer.mozilla.org/en-US/docs/Web/API/Navigator/webdriver
     */
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined,
    });

    // Mock languages and plugins to mimic a real browser
    Object.defineProperty(navigator, 'languages', {
      get: () => ['en-US', 'en'],
    });
    Object.defineProperty(navigator, 'plugins', {
      get: () => [{}, {}, {}, {}, {}],
    });

    // Redefine the headless property
    Object.defineProperty(navigator, 'headless', {
      get: () => false,
    });

    // Override the permissions API.
    // The native `query` must stay bound to the Permissions object: calling
    // the detached method throws "Illegal invocation" for every permission
    // other than the mocked 'notifications' one.
    const permissions = window.navigator.permissions;
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) =>
      parameters.name === 'notifications'
        ? Promise.resolve({
            state: Notification.permission,
          } as PermissionStatus)
        : originalQuery(parameters);
  });
}

/**
 * Sets up request interception to block unnecessary resources and apply stealth techniques
 *
 * After interception is enabled, only navigation requests whose resource type
 * is `document` are allowed through; every other request is aborted.
 *
 * @param page - Puppeteer page object
 */
export async function interceptRequest(page: Page): Promise<void> {
  await applyStealthScripts(page);
  await page.setRequestInterception(true);

  page.on('request', (request) => {
    const isDocumentNavigation =
      request.resourceType() === 'document' && request.isNavigationRequest();

    return isDocumentNavigation ? request.continue() : request.abort();
  });
}

/**
 * Interface representing extracted page information
 */
interface PageInfo {
  /** Page title */
  title: string;
  /** Page content in HTML format */
  content: string;
}

/**
 * !NOTE: This function runs in the context of the browser page, not Node.js
 *
 * Extracts readable content from a web page using Readability.
 * Keep the body self-contained: it presumably gets shipped to the page as
 * source text, so it should not reference anything from this module's scope.
 *
 * @param window Browser window object
 * @param readabilityScript Readability library script as string
 * @returns Extracted page information (title and content)
 */
export function extractPageInformation(
  window: Window,
  readabilityScript: string,
): PageInfo {
  // Evaluate the library source with a stub `module` object and take whatever
  // it assigned to `module.exports` as the Readability constructor.
  const Readability = new Function(
    'module',
    `${readabilityScript}\nreturn module.exports`,
  )({});

  const document = window.document;

  // Remove non-content elements to improve extraction quality
  document
    .querySelectorAll(
      'script,noscript,style,link,svg,img,video,iframe,canvas,.reflist',
    )
    .forEach((el) => el.remove());

  // Parse the document using Readability
  const article = new Readability(document).parse();
  const content = article?.content || '';
  const title = document.title;

  return {
    content,
    // Prefer the parsed article title; fall back to the document title when
    // it is missing or empty.
    title: article?.title || title,
  };
}

/**
 * Options accepted by {@link toMarkdown}: every Turndown option plus a switch
 * for the GitHub-flavoured-Markdown plugin.
 */
export interface ToMarkdownOptions extends Turndown.Options {
  /** Enable the `turndown-plugin-gfm` plugin (defaults to `true`). */
  gfmExtension?: boolean;
}

/**
 * Convert HTML content to Markdown format
 *
 * Defaults: fenced code blocks, ATX headings, `*` for emphasis and `**` for
 * strong text. All other Turndown options supplied by the caller are
 * forwarded unchanged. If the conversion throws, the error is logged and the
 * original HTML is returned.
 *
 * @param html HTML string
 * @param options Conversion options
 * @returns Markdown string
 */
export function toMarkdown(
  html: string,
  options: ToMarkdownOptions = {},
): string {
  if (!html) return '';

  try {
    const {
      codeBlockStyle = 'fenced',
      headingStyle = 'atx',
      emDelimiter = '*',
      strongDelimiter = '**',
      gfmExtension = true,
      // Everything else declared by Turndown.Options; previously dropped.
      ...otherTurndownOptions
    } = options;

    const turndown = new Turndown({
      ...otherTurndownOptions,
      codeBlockStyle,
      headingStyle,
      emDelimiter,
      strongDelimiter,
    });

    if (gfmExtension) {
      turndown.use(gfm);
    }

    return turndown.turndown(html);
  } catch (error) {
    logger.error('Error converting HTML to Markdown:', error);
    // Best effort: hand back the untouched HTML rather than failing the caller.
    return html;
  }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/yokingma/one-search-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.