extract
Extract structured data from any URL as JSON by providing a schema with field names and types for prices, availability, product details, or contact information.
Instructions
Extract structured data from any URL as JSON. Provide a schema describing what fields you want. Schema format: {"fieldName": "type"} where type is one of: string, number, boolean, array, object. Example: {"title": "string", "price": "number", "inStock": "boolean"}. Great for prices, availability, product details, contact info.
Input Schema
Table / JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | The URL to extract data from | |
| schema | Yes | Field names mapped to type strings. Format: {"fieldName": "string\|number\|boolean\|array\|object"}. Example: {"title": "string", "price": "number", "inStock": "boolean", "tags": "array"} | |
| context | No | Optional: what you're trying to accomplish (helps LLM extraction accuracy) | |
Implementation Reference
- src/extract.ts:233-316 (handler) The POST /extract route handler, which orchestrates scraping (via tier 0 or the browser pool) and then performs LLM-based structured data extraction.
/**
 * POST /extract — scrape a URL and extract the fields described by `schema`.
 *
 * Request body: `{ url: string, schema: Record<string, type> }` where every
 * type is one of string | number | boolean | array | object.
 *
 * Responses produced here:
 *  - 200 `{ url, success, data, extractionMethod, markdown, title }`
 *  - 400 when `url` is missing or `schema` is malformed
 *  - 422 when the browser-pool scrape does not report success
 *  - 500 when extraction or the browser pool throws
 */
app.post('/extract', async (req: FastifyRequest, reply: FastifyReply) => {
  const body = req.body as ExtractRequestBody;
  const url = body?.url;
  const schema = body?.schema;

  if (!url || typeof url !== 'string') {
    return reply.status(400).send({ error: 'url is required' });
  }

  // Single source of truth for the values echoed back in validation errors.
  const validTypeList = ['string', 'number', 'boolean', 'array', 'object'];
  const schemaExample = { title: 'string', price: 'number', inStock: 'boolean', tags: 'array' };

  if (!schema || typeof schema !== 'object' || Array.isArray(schema)) {
    return reply.status(400).send({
      error: 'schema_required',
      message: 'schema must be an object mapping field names to type strings',
      example: schemaExample,
      validTypes: validTypeList,
    });
  }

  // Validate schema values: every field must map to one of the known type names.
  const validTypes = new Set(validTypeList);
  for (const [key, val] of Object.entries(schema as Record<string, unknown>)) {
    if (typeof val !== 'string' || !validTypes.has(val)) {
      return reply.status(400).send({
        error: 'invalid_schema_type',
        message: `Invalid type "${val}" for field "${key}"`,
        validTypes: validTypeList,
        example: schemaExample,
      });
    }
  }

  const typedSchema = schema as Schema;

  // ── Helper: run LLM extraction + return reply ─────────────────────────
  // Tries the LLM first; any LLM error downgrades to the regex extractor so a
  // scraped page still yields a response.
  async function runExtraction(markdown: string, title: string): Promise<ReturnType<typeof reply.send>> {
    let data: Record<string, any>;
    let extractionMethod: string;
    try {
      data = await extractWithLLM(markdown, typedSchema);
      extractionMethod = 'llm';
    } catch (llmErr: unknown) {
      const reason = llmErr instanceof Error ? llmErr.message : String(llmErr);
      console.warn('[extract] LLM failed, falling back to regex:', reason);
      data = extractFromMarkdown(markdown, typedSchema);
      extractionMethod = 'regex-fallback';
    }
    return reply.send({ url, success: true, data, extractionMethod, markdown, title });
  }

  // ── Tier 0: plain HTTP fetch (no browser) — fast path for simple pages ─
  // Only the scrape itself is best-effort. Extraction runs outside this
  // try/catch so that a failure while extracting or replying is reported
  // instead of being swallowed and retried through the browser pool (which
  // could attempt to send a second reply).
  let tier0: Awaited<ReturnType<typeof scrapeUrlTier0>> | null = null;
  try {
    tier0 = await scrapeUrlTier0(url);
  } catch {
    // tier0 failed silently — fall through to browser pool
  }

  if (tier0 && tier0.status === 'success' && tier0.markdown) {
    try {
      return await runExtraction(tier0.markdown, tier0.title ?? '');
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      return reply.status(500).send({ error: 'Extract failed', message });
    }
  }

  // ── Tier 1+: browser pool ──────────────────────────────────────────────
  let session: Awaited<ReturnType<typeof acquireSession>> | null = null;
  // Passed to releaseSession; presumably lets the pool treat the session as
  // unhealthy — TODO confirm against releaseSession.
  let hadError = false;
  try {
    session = await acquireSession();
    const browser = session.browser as Browser;
    const result = await scrapeUrlWithFallback(browser, url, true);

    if (result.status !== 'success') {
      hadError = true;
      return reply.status(422).send({
        error: 'Failed to scrape URL',
        reason: result.error || result.status,
        url,
      });
    }

    return await runExtraction(result.markdown, result.title ?? '');
  } catch (err: unknown) {
    hadError = true;
    const message = err instanceof Error ? err.message : String(err);
    return reply.status(500).send({ error: 'Extract failed', message });
  } finally {
    // Always hand the session back, whether or not a reply was produced.
    if (session) releaseSession(session, hadError);
  }
});
- src/extract.ts:21-103 (helper)The main extraction logic using LLMs (OpenClaw, Gemini, or Anthropic) to map page content to the provided JSON schema.
/** Maximum number of markdown characters sent to the LLM (controls token cost). */
const LLM_MAX_CONTENT_CHARS = 8000;
/** Per-provider HTTP timeout; covers the request and reading the response body. */
const LLM_REQUEST_TIMEOUT_MS = 20000;
/** Upper bound on tokens the model may generate for its JSON answer. */
const LLM_MAX_OUTPUT_TOKENS = 1024;

/** Best-effort readable message for a caught value of unknown type. */
function llmErrorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Strip Markdown code fences from an LLM reply and parse it as a JSON object.
 *
 * @param text Raw text returned by the model.
 * @returns The parsed object.
 * @throws Error when the reply is not a string, is empty, is not valid JSON,
 *         or parses to something other than a plain object.
 */
function parseLlmJsonObject(text: unknown): Record<string, any> {
  if (typeof text !== 'string') throw new Error('LLM response contained no text');
  const cleaned = text.replace(/```json\n?/g, '').replace(/```\n?/g, '').trim();
  if (!cleaned) throw new Error('LLM returned an empty response');
  const parsed: unknown = JSON.parse(cleaned);
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error('LLM response is not a JSON object');
  }
  return parsed as Record<string, any>;
}

/**
 * POST a JSON payload and return the decoded JSON response body.
 *
 * The abort timer is cleared in `finally`, so it is released even when the
 * request rejects, and it stays armed while the body is being read.
 *
 * @param label Provider name used in the "<label> returned <status>" error.
 * @throws Error on a non-2xx status, on timeout/abort, or on an invalid body.
 */
async function postLlmJson(
  url: string,
  headers: Record<string, string>,
  payload: unknown,
  label: string,
): Promise<any> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), LLM_REQUEST_TIMEOUT_MS);
  try {
    const res = await fetch(url, {
      method: 'POST',
      headers: { ...headers, 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
      signal: controller.signal,
    });
    if (!res.ok) throw new Error(`${label} returned ${res.status}`);
    return await res.json();
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Ask an LLM to extract the schema's fields from page markdown.
 *
 * Providers are tried in order, each only when its environment variables are
 * set: OpenClaw gateway, then Gemini, then Anthropic. A failure of the first
 * two is logged and the next provider is tried; a failure of the last one
 * propagates to the caller.
 *
 * @param markdown Page content; only the first 8000 characters are sent.
 * @param schema   Field name → type name pairs to extract.
 * @returns The JSON object produced by the first provider that succeeds.
 * @throws Error when no provider is configured, when every configured
 *         provider fails, or when the Anthropic call/parse fails.
 */
async function extractWithLLM(markdown: string, schema: Schema): Promise<Record<string, any>> {
  // Truncate markdown to control token costs
  const content = markdown.slice(0, LLM_MAX_CONTENT_CHARS);
  const schemaDesc = Object.entries(schema)
    .map(([key, type]) => `- ${key} (${type})`)
    .join('\n');
  const prompt = `Extract the following fields from this webpage content. Return ONLY valid JSON, no explanation.\n\nFields:\n${schemaDesc}\n\nContent:\n${content}`;

  // Tracks whether anything was tried, so the final error can distinguish
  // "nothing configured" from "everything configured has failed".
  let providerAttempted = false;

  // ── 1. OpenClaw gateway (subscription-based, zero marginal cost) ──────────
  const gatewayUrl = process.env.OPENCLAW_GATEWAY_URL;
  const gatewayToken = process.env.OPENCLAW_GATEWAY_TOKEN;
  if (gatewayUrl && gatewayToken) {
    providerAttempted = true;
    try {
      // Drop trailing slashes so the joined path never contains "//v1".
      const endpoint = `${gatewayUrl.replace(/\/+$/, '')}/v1/chat/completions`;
      const data = await postLlmJson(
        endpoint,
        { Authorization: `Bearer ${gatewayToken}` },
        {
          model: 'openclaw:main',
          messages: [{ role: 'user', content: prompt }],
          max_tokens: LLM_MAX_OUTPUT_TOKENS,
        },
        'Gateway',
      );
      return parseLlmJsonObject(data?.choices?.[0]?.message?.content);
    } catch (gwErr: unknown) {
      console.warn('[extract] Gateway failed, trying Gemini:', llmErrorMessage(gwErr));
    }
  }

  // ── 2. Gemini Flash 2.0 (second choice — ~$0.00002/call) ──────────────────
  const geminiKey = process.env.GEMINI_API_KEY;
  if (geminiKey) {
    providerAttempted = true;
    try {
      // The key travels in the x-goog-api-key header rather than the query
      // string so it cannot end up in URL logs.
      const data = await postLlmJson(
        'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent',
        { 'x-goog-api-key': geminiKey },
        {
          contents: [{ parts: [{ text: prompt }] }],
          generationConfig: { maxOutputTokens: LLM_MAX_OUTPUT_TOKENS, temperature: 0.1 },
        },
        'Gemini',
      );
      return parseLlmJsonObject(data?.candidates?.[0]?.content?.parts?.[0]?.text);
    } catch (gemErr: unknown) {
      console.warn('[extract] Gemini failed, trying Anthropic:', llmErrorMessage(gemErr));
    }
  }

  // ── 3. Anthropic Haiku (last resort fallback) ─────────────────────────────
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    throw new Error(
      providerAttempted
        ? 'All configured LLM providers failed (set ANTHROPIC_API_KEY to enable the last-resort fallback)'
        : 'No LLM configured (set GEMINI_API_KEY, OPENCLAW_GATEWAY_URL, or ANTHROPIC_API_KEY)',
    );
  }

  // Loaded lazily so the SDK is only imported when this fallback is reached.
  const { default: Anthropic } = await import('@anthropic-ai/sdk');
  const client = new Anthropic({ apiKey });
  const message = await client.messages.create({
    model: 'claude-3-haiku-20240307',
    max_tokens: LLM_MAX_OUTPUT_TOKENS,
    messages: [{ role: 'user', content: prompt }],
  });
  // Guard against an empty content array or a non-text first block.
  const firstBlock = message.content[0] as { text?: unknown } | undefined;
  return parseLlmJsonObject(firstBlock?.text);
}
- src/extract.ts:232-319 (registration)Registration function for the /extract route in the Fastify application.
/**
 * Register the POST /extract route on the given Fastify instance.
 *
 * The route scrapes the requested URL (plain HTTP fetch first, browser pool
 * second) and extracts the fields described by the request's `schema`.
 *
 * Responses produced by the route:
 *  - 200 `{ url, success, data, extractionMethod, markdown, title }`
 *  - 400 when `url` is missing or `schema` is malformed
 *  - 422 when the browser-pool scrape does not report success
 *  - 500 when extraction or the browser pool throws
 *
 * @param app Fastify instance to attach the route to.
 */
export async function registerExtractRoutes(app: FastifyInstance): Promise<void> {
  app.post('/extract', async (req: FastifyRequest, reply: FastifyReply) => {
    const body = req.body as ExtractRequestBody;
    const url = body?.url;
    const schema = body?.schema;

    if (!url || typeof url !== 'string') {
      return reply.status(400).send({ error: 'url is required' });
    }

    // Single source of truth for the values echoed back in validation errors.
    const validTypeList = ['string', 'number', 'boolean', 'array', 'object'];
    const schemaExample = { title: 'string', price: 'number', inStock: 'boolean', tags: 'array' };

    if (!schema || typeof schema !== 'object' || Array.isArray(schema)) {
      return reply.status(400).send({
        error: 'schema_required',
        message: 'schema must be an object mapping field names to type strings',
        example: schemaExample,
        validTypes: validTypeList,
      });
    }

    // Validate schema values: every field must map to one of the known type names.
    const validTypes = new Set(validTypeList);
    for (const [key, val] of Object.entries(schema as Record<string, unknown>)) {
      if (typeof val !== 'string' || !validTypes.has(val)) {
        return reply.status(400).send({
          error: 'invalid_schema_type',
          message: `Invalid type "${val}" for field "${key}"`,
          validTypes: validTypeList,
          example: schemaExample,
        });
      }
    }

    const typedSchema = schema as Schema;

    // ── Helper: run LLM extraction + return reply ─────────────────────────
    // Tries the LLM first; any LLM error downgrades to the regex extractor so
    // a scraped page still yields a response.
    async function runExtraction(markdown: string, title: string): Promise<ReturnType<typeof reply.send>> {
      let data: Record<string, any>;
      let extractionMethod: string;
      try {
        data = await extractWithLLM(markdown, typedSchema);
        extractionMethod = 'llm';
      } catch (llmErr: unknown) {
        const reason = llmErr instanceof Error ? llmErr.message : String(llmErr);
        console.warn('[extract] LLM failed, falling back to regex:', reason);
        data = extractFromMarkdown(markdown, typedSchema);
        extractionMethod = 'regex-fallback';
      }
      return reply.send({ url, success: true, data, extractionMethod, markdown, title });
    }

    // ── Tier 0: plain HTTP fetch (no browser) — fast path for simple pages ─
    // Only the scrape itself is best-effort. Extraction runs outside this
    // try/catch so that a failure while extracting or replying is reported
    // instead of being swallowed and retried through the browser pool (which
    // could attempt to send a second reply).
    let tier0: Awaited<ReturnType<typeof scrapeUrlTier0>> | null = null;
    try {
      tier0 = await scrapeUrlTier0(url);
    } catch {
      // tier0 failed silently — fall through to browser pool
    }

    if (tier0 && tier0.status === 'success' && tier0.markdown) {
      try {
        return await runExtraction(tier0.markdown, tier0.title ?? '');
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        return reply.status(500).send({ error: 'Extract failed', message });
      }
    }

    // ── Tier 1+: browser pool ──────────────────────────────────────────────
    let session: Awaited<ReturnType<typeof acquireSession>> | null = null;
    // Passed to releaseSession; presumably lets the pool treat the session as
    // unhealthy — TODO confirm against releaseSession.
    let hadError = false;
    try {
      session = await acquireSession();
      const browser = session.browser as Browser;
      const result = await scrapeUrlWithFallback(browser, url, true);

      if (result.status !== 'success') {
        hadError = true;
        return reply.status(422).send({
          error: 'Failed to scrape URL',
          reason: result.error || result.status,
          url,
        });
      }

      return await runExtraction(result.markdown, result.title ?? '');
    } catch (err: unknown) {
      hadError = true;
      const message = err instanceof Error ? err.message : String(err);
      return reply.status(500).send({ error: 'Extract failed', message });
    } finally {
      // Always hand the session back, whether or not a reply was produced.
      if (session) releaseSession(session, hadError);
    }
  });

  // NOTE(review): the message names claude-3-haiku, but extractWithLLM only
  // reaches Anthropic after the OpenClaw gateway and Gemini — consider rewording.
  console.log('[extract] POST /extract registered ($0.01/call, LLM-based extraction with claude-3-haiku)');
}