Skip to main content
Glama

extract

Extract structured data from any URL as JSON by providing a schema with field names and types for prices, availability, product details, or contact information.

Instructions

Extract structured data from any URL as JSON. Provide a schema describing what fields you want. Schema format: {"fieldName": "type"} where type is one of: string, number, boolean, array, object. Example: {"title": "string", "price": "number", "inStock": "boolean"}. Great for prices, availability, product details, contact info.

Input Schema

TableJSON Schema
| Name | Required | Description |
| --- | --- | --- |
| url | Yes | The URL to extract data from |
| schema | Yes | Field names mapped to type strings. Format: {"fieldName": "string|number|boolean|array|object"}. Example: {"title": "string", "price": "number", "inStock": "boolean", "tags": "array"} |
| context | No | Optional: what you're trying to accomplish (helps LLM extraction accuracy) |

Implementation Reference

  • The POST /extract route handler which orchestrates scraping (via tier0 or browser pool) and then performs LLM-based structured data extraction.
    // POST /extract — scrape a URL (tier0 fast path, then browser pool) and
    // map the page content onto the caller-supplied JSON schema via LLM
    // extraction, with a regex-based fallback when every provider fails.
    app.post('/extract', async (req: FastifyRequest, reply: FastifyReply) => {
      const body = req.body as ExtractRequestBody;
      const url = body?.url;
      const schema = body?.schema;

      if (!url || typeof url !== 'string') {
        return reply.status(400).send({ error: 'url is required' });
      }
      // Reject syntactically invalid and non-http(s) URLs up front instead of
      // letting the scraper fail later with a less actionable error.
      try {
        const parsedUrl = new URL(url);
        if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
          return reply.status(400).send({ error: 'url must use http or https' });
        }
      } catch {
        return reply.status(400).send({ error: 'url is not a valid URL' });
      }
      // An empty schema object would yield an empty extraction prompt, so it
      // is rejected alongside missing/non-object schemas.
      if (!schema || typeof schema !== 'object' || Array.isArray(schema) || Object.keys(schema).length === 0) {
        return reply.status(400).send({
          error: 'schema_required',
          message: 'schema must be a non-empty object mapping field names to type strings',
          example: { title: 'string', price: 'number', inStock: 'boolean', tags: 'array' },
          validTypes: ['string', 'number', 'boolean', 'array', 'object'],
        });
      }

      // Validate schema values: each must be a string from the allowed set
      // (the original cast `val as string` skipped the typeof check, so e.g.
      // a nested object slipped through to the LLM prompt).
      const validTypes = new Set(['string', 'number', 'boolean', 'array', 'object']);
      for (const [key, val] of Object.entries(schema as Record<string, unknown>)) {
        if (typeof val !== 'string' || !validTypes.has(val)) {
          return reply.status(400).send({
            error: 'invalid_schema_type',
            message: `Invalid type "${val}" for field "${key}"`,
            validTypes: ['string', 'number', 'boolean', 'array', 'object'],
            example: { title: 'string', price: 'number', inStock: 'boolean', tags: 'array' },
          });
        }
      }

      const typedSchema = schema as Schema;

      // ── Helper: run LLM extraction + return reply ─────────────────────────
      // Degrades gracefully: if every LLM provider fails, fall back to the
      // regex-based extractor rather than returning an error.
      async function runExtraction(markdown: string, title: string): Promise<ReturnType<typeof reply.send>> {
        let data: Record<string, any>;
        let extractionMethod: string;
        try {
          data = await extractWithLLM(markdown, typedSchema);
          extractionMethod = 'llm';
        } catch (llmErr: any) {
          console.warn('[extract] LLM failed, falling back to regex:', llmErr.message);
          data = extractFromMarkdown(markdown, typedSchema);
          extractionMethod = 'regex-fallback';
        }
        return reply.send({ url, success: true, data, extractionMethod, markdown, title });
      }

      // ── Tier 0: plain HTTP fetch (no browser) — fast path for simple pages ─
      try {
        const tier0 = await scrapeUrlTier0(url);
        if (tier0 && tier0.status === 'success' && tier0.markdown) {
          return await runExtraction(tier0.markdown, tier0.title ?? '');
        }
      } catch {
        // tier0 failed silently — fall through to browser pool
      }

      // ── Tier 1+: browser pool ──────────────────────────────────────────────
      let session: Awaited<ReturnType<typeof acquireSession>> | null = null;
      let hadError = false;

      try {
        session = await acquireSession();
        const browser = session.browser as Browser;

        const result = await scrapeUrlWithFallback(browser, url, true);

        if (result.status !== 'success') {
          hadError = true;
          return reply.status(422).send({
            error: 'Failed to scrape URL',
            reason: result.error || result.status,
            url,
          });
        }

        return await runExtraction(result.markdown, result.title ?? '');
      } catch (err: any) {
        hadError = true;
        return reply.status(500).send({ error: 'Extract failed', message: err.message });
      } finally {
        // Always return the pooled session; hadError flags possibly-broken
        // sessions so the pool can recycle them instead of reusing them.
        if (session) releaseSession(session, hadError);
      }
    });
  • The main extraction logic using LLMs (OpenClaw, Gemini, or Anthropic) to map page content to the provided JSON schema.
    // Maps scraped page content onto the caller's schema using a provider
    // chain: OpenClaw gateway → Gemini Flash 2.0 → Anthropic Haiku.
    // Throws when no provider is configured or the last provider fails;
    // the route handler catches that and falls back to regex extraction.
    async function extractWithLLM(markdown: string, schema: Schema): Promise<Record<string, any>> {
      // Truncate markdown to 8000 chars to control token costs
      const content = markdown.slice(0, 8000);

      const schemaDesc = Object.entries(schema)
        .map(([key, type]) => `- ${key} (${type})`)
        .join('\n');

      const prompt = `Extract the following fields from this webpage content. Return ONLY valid JSON, no explanation.\n\nFields:\n${schemaDesc}\n\nContent:\n${content}`;

      // Models often wrap JSON in ``` fences despite instructions; strip them
      // before parsing. Shared by all three providers (previously duplicated
      // three times). Throws on malformed JSON so the chain falls through.
      const parseModelJson = (text: string): Record<string, any> => {
        const cleaned = text.replace(/```json\n?/g, '').replace(/```\n?/g, '').trim();
        return JSON.parse(cleaned);
      };

      // ── 1. OpenClaw gateway (subscription-based, zero marginal cost) ──────────
      const gatewayUrl = process.env.OPENCLAW_GATEWAY_URL;
      const gatewayToken = process.env.OPENCLAW_GATEWAY_TOKEN;

      if (gatewayUrl && gatewayToken) {
        try {
          // AbortSignal.timeout replaces the manual AbortController+setTimeout
          // pattern, which leaked the timer whenever fetch rejected before
          // clearTimeout ran (keeping the event loop alive for up to 20s).
          const res = await fetch(`${gatewayUrl}/v1/chat/completions`, {
            method: 'POST',
            headers: { 'Authorization': `Bearer ${gatewayToken}`, 'Content-Type': 'application/json' },
            body: JSON.stringify({
              model: 'openclaw:main',
              messages: [{ role: 'user', content: prompt }],
              max_tokens: 1024
            }),
            signal: AbortSignal.timeout(20000)
          });
          if (!res.ok) throw new Error(`Gateway returned ${res.status}`);
          const data = await res.json() as any;
          const text = data.choices?.[0]?.message?.content || '';
          return parseModelJson(text);
        } catch (gwErr: any) {
          console.warn('[extract] Gateway failed, trying Gemini:', gwErr.message);
        }
      }

      // ── 2. Gemini Flash 2.0 (primary — ~$0.00002/call) ───────────────────────
      const geminiKey = process.env.GEMINI_API_KEY;
      if (geminiKey) {
        try {
          const res = await fetch(
            `https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=${geminiKey}`,
            {
              method: 'POST',
              headers: { 'Content-Type': 'application/json' },
              body: JSON.stringify({
                contents: [{ parts: [{ text: prompt }] }],
                // Low temperature: extraction should be deterministic.
                generationConfig: { maxOutputTokens: 1024, temperature: 0.1 }
              }),
              signal: AbortSignal.timeout(20000)
            }
          );
          if (!res.ok) throw new Error(`Gemini returned ${res.status}`);
          const data = await res.json() as any;
          const text = data.candidates?.[0]?.content?.parts?.[0]?.text || '';
          return parseModelJson(text);
        } catch (gemErr: any) {
          console.warn('[extract] Gemini failed, trying Anthropic:', gemErr.message);
        }
      }

      // ── 3. Anthropic Haiku (last resort fallback) ─────────────────────────────
      const apiKey = process.env.ANTHROPIC_API_KEY;
      if (!apiKey) throw new Error('No LLM configured (set GEMINI_API_KEY, OPENCLAW_GATEWAY_URL, or ANTHROPIC_API_KEY)');

      // Lazy import keeps the SDK out of the startup path when unused.
      const { default: Anthropic } = await import('@anthropic-ai/sdk');
      const client = new Anthropic({ apiKey });
      const message = await client.messages.create({
        model: 'claude-3-haiku-20240307',
        max_tokens: 1024,
        messages: [{ role: 'user', content: prompt }]
      });
      const text = (message.content[0] as any).text;
      return parseModelJson(text);
    }
  • src/extract.ts:232-319 (registration)
    Registration function for the /extract route in the Fastify application.
    /**
     * Registers the POST /extract route: scrape a URL (tier0 fast path, then
     * browser pool) and map the page content onto the caller-supplied JSON
     * schema via LLM extraction, with a regex-based fallback.
     */
    export async function registerExtractRoutes(app: FastifyInstance): Promise<void> {
      app.post('/extract', async (req: FastifyRequest, reply: FastifyReply) => {
        const body = req.body as ExtractRequestBody;
        const url = body?.url;
        const schema = body?.schema;

        if (!url || typeof url !== 'string') {
          return reply.status(400).send({ error: 'url is required' });
        }
        // Reject syntactically invalid and non-http(s) URLs up front instead
        // of letting the scraper fail later with a less actionable error.
        try {
          const parsedUrl = new URL(url);
          if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
            return reply.status(400).send({ error: 'url must use http or https' });
          }
        } catch {
          return reply.status(400).send({ error: 'url is not a valid URL' });
        }
        // An empty schema object would yield an empty extraction prompt, so
        // it is rejected alongside missing/non-object schemas.
        if (!schema || typeof schema !== 'object' || Array.isArray(schema) || Object.keys(schema).length === 0) {
          return reply.status(400).send({
            error: 'schema_required',
            message: 'schema must be a non-empty object mapping field names to type strings',
            example: { title: 'string', price: 'number', inStock: 'boolean', tags: 'array' },
            validTypes: ['string', 'number', 'boolean', 'array', 'object'],
          });
        }

        // Validate schema values: each must be a string from the allowed set
        // (the original cast `val as string` skipped the typeof check, so
        // e.g. a nested object slipped through to the LLM prompt).
        const validTypes = new Set(['string', 'number', 'boolean', 'array', 'object']);
        for (const [key, val] of Object.entries(schema as Record<string, unknown>)) {
          if (typeof val !== 'string' || !validTypes.has(val)) {
            return reply.status(400).send({
              error: 'invalid_schema_type',
              message: `Invalid type "${val}" for field "${key}"`,
              validTypes: ['string', 'number', 'boolean', 'array', 'object'],
              example: { title: 'string', price: 'number', inStock: 'boolean', tags: 'array' },
            });
          }
        }

        const typedSchema = schema as Schema;

        // ── Helper: run LLM extraction + return reply ─────────────────────────
        // Degrades gracefully: if every LLM provider fails, fall back to the
        // regex-based extractor rather than returning an error.
        async function runExtraction(markdown: string, title: string): Promise<ReturnType<typeof reply.send>> {
          let data: Record<string, any>;
          let extractionMethod: string;
          try {
            data = await extractWithLLM(markdown, typedSchema);
            extractionMethod = 'llm';
          } catch (llmErr: any) {
            console.warn('[extract] LLM failed, falling back to regex:', llmErr.message);
            data = extractFromMarkdown(markdown, typedSchema);
            extractionMethod = 'regex-fallback';
          }
          return reply.send({ url, success: true, data, extractionMethod, markdown, title });
        }

        // ── Tier 0: plain HTTP fetch (no browser) — fast path for simple pages ─
        try {
          const tier0 = await scrapeUrlTier0(url);
          if (tier0 && tier0.status === 'success' && tier0.markdown) {
            return await runExtraction(tier0.markdown, tier0.title ?? '');
          }
        } catch {
          // tier0 failed silently — fall through to browser pool
        }

        // ── Tier 1+: browser pool ──────────────────────────────────────────────
        let session: Awaited<ReturnType<typeof acquireSession>> | null = null;
        let hadError = false;

        try {
          session = await acquireSession();
          const browser = session.browser as Browser;

          const result = await scrapeUrlWithFallback(browser, url, true);

          if (result.status !== 'success') {
            hadError = true;
            return reply.status(422).send({
              error: 'Failed to scrape URL',
              reason: result.error || result.status,
              url,
            });
          }

          return await runExtraction(result.markdown, result.title ?? '');
        } catch (err: any) {
          hadError = true;
          return reply.status(500).send({ error: 'Extract failed', message: err.message });
        } finally {
          // Always return the pooled session; hadError flags possibly-broken
          // sessions so the pool can recycle them instead of reusing them.
          if (session) releaseSession(session, hadError);
        }
      });

      // Log reflects the actual provider chain: the original message claimed
      // "$0.01/call with claude-3-haiku", but Haiku is only the last resort.
      console.log('[extract] POST /extract registered (LLM extraction: OpenClaw gateway → Gemini Flash → Claude Haiku, regex fallback)');
    }
Install Server

Other Tools

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/kc23go/anybrowse'

If you have feedback or need assistance with the MCP directory API, please join our Discord server