Skip to main content
Glama
dataExtractor.ts10.9 kB
/** * Data Extractor Agent * Extracts structured data from various unstructured sources * including text, documents, and semi-structured data. * * Features: * - Schema-based extraction * - Entity recognition * - Table extraction * - Key-value pair extraction * - Format conversion * - Validation and normalization * - n8n webhook compatible */ import type { AgentLLM } from './react.js'; import { logger } from '../logging/logger.js'; export interface DataSchema { name: string; fields: Array<{ name: string; type: 'string' | 'number' | 'boolean' | 'date' | 'array' | 'object'; description?: string; required?: boolean; example?: string; }>; } export interface ExtractionResult { success: boolean; data: Record<string, unknown>; confidence: number; missingFields: string[]; validationErrors: string[]; processingTime: number; } export interface EntityResult { type: string; value: string; confidence: number; context?: string; } export interface TableData { headers: string[]; rows: string[][]; format?: 'csv' | 'json' | 'markdown'; } /** * Data Extractor Agent * Extracts structured data from unstructured text */ export class DataExtractor { private llm: AgentLLM; constructor(llm: AgentLLM) { this.llm = llm; } /** * Extract data according to a defined schema */ async extractWithSchema(content: string, schema: DataSchema): Promise<ExtractionResult> { const startTime = Date.now(); logger.info('Extracting data with schema', { schema: schema.name }); const schemaDescription = schema.fields .map((f) => `- ${f.name} (${f.type}${f.required ? ', required' : ''}): ${f.description || ''}`) .join('\n'); const response = await this.llm.chat({ messages: [ { role: 'system', content: `You are a data extraction expert. Extract data according to the schema. Return ONLY valid JSON matching the schema. Use null for missing optional fields.`, }, { role: 'user', content: `Schema "${schema.name}": ${schemaDescription} Content to extract from: --- ${content.slice(0, 6000)} --- Extract data as JSON object.`, }, ], maxTokens: 2000, temperature: 0.1, }); try { const extracted = JSON.parse(response.content); // Validate required fields const missingFields: string[] = []; const validationErrors: string[] = []; for (const field of schema.fields) { if (field.required && (extracted[field.name] === null || extracted[field.name] === undefined)) { missingFields.push(field.name); } // Type validation if (extracted[field.name] !== null && extracted[field.name] !== undefined) { const actualType = Array.isArray(extracted[field.name]) ? 'array' : typeof extracted[field.name]; if (field.type !== 'date' && actualType !== field.type) { validationErrors.push(`${field.name}: expected ${field.type}, got ${actualType}`); } } } const confidence = missingFields.length === 0 && validationErrors.length === 0 ? 0.95 : Math.max(0.3, 0.95 - missingFields.length * 0.1 - validationErrors.length * 0.05); return { success: missingFields.length === 0, data: extracted, confidence, missingFields, validationErrors, processingTime: Date.now() - startTime, }; } catch { return { success: false, data: {}, confidence: 0, missingFields: schema.fields.filter((f) => f.required).map((f) => f.name), validationErrors: ['Failed to parse extraction result'], processingTime: Date.now() - startTime, }; } } /** * Extract named entities from text */ async extractEntities( content: string, entityTypes: string[] = ['person', 'organization', 'location', 'date', 'money', 'email', 'phone', 'url'] ): Promise<EntityResult[]> { const response = await this.llm.chat({ messages: [ { role: 'system', content: `You are a named entity recognition expert. Extract entities from the text. Return JSON array: [{"type": "entity_type", "value": "extracted_value", "confidence": 0.0-1.0, "context": "surrounding text"}] Entity types to find: ${entityTypes.join(', ')}`, }, { role: 'user', content: `Extract entities from:\n\n${content.slice(0, 5000)}`, }, ], maxTokens: 2000, temperature: 0.1, }); try { return JSON.parse(response.content); } catch { return []; } } /** * Extract tabular data from text */ async extractTable(content: string, expectedColumns?: string[]): Promise<TableData | null> { const columnsHint = expectedColumns ? `Expected columns: ${expectedColumns.join(', ')}` : 'Infer columns from content.'; const response = await this.llm.chat({ messages: [ { role: 'system', content: `You are a table extraction expert. Extract tabular data from the text. Return JSON: {"headers": ["col1", "col2"], "rows": [["val1", "val2"], ["val3", "val4"]]} ${columnsHint}`, }, { role: 'user', content: `Extract table from:\n\n${content.slice(0, 5000)}`, }, ], maxTokens: 2000, temperature: 0.1, }); try { return JSON.parse(response.content); } catch { return null; } } /** * Extract key-value pairs from text */ async extractKeyValues(content: string): Promise<Record<string, string>> { const response = await this.llm.chat({ messages: [ { role: 'system', content: 'Extract key-value pairs from the text. Return a flat JSON object with string keys and string values.', }, { role: 'user', content: `Extract key-value pairs from:\n\n${content.slice(0, 5000)}`, }, ], maxTokens: 1500, temperature: 0.1, }); try { return JSON.parse(response.content); } catch { return {}; } } /** * Convert data between formats */ async convertFormat( data: string, fromFormat: 'csv' | 'json' | 'xml' | 'yaml' | 'text', toFormat: 'csv' | 'json' | 'xml' | 'yaml' | 'markdown' ): Promise<string> { const response = await this.llm.chat({ messages: [ { role: 'system', content: `Convert the data from ${fromFormat} to ${toFormat} format. Return only the converted data.`, }, { role: 'user', content: `Convert this ${fromFormat} data to ${toFormat}:\n\n${data.slice(0, 6000)}`, }, ], maxTokens: 3000, temperature: 0, }); return response.content; } /** * Batch extract with same schema */ async batchExtract( contents: Array<{ id: string; content: string }>, schema: DataSchema ): Promise< Array<{ id: string; result: ExtractionResult; }> > { const results: Array<{ id: string; result: ExtractionResult }> = []; for (const item of contents.slice(0, 50)) { const result = await this.extractWithSchema(item.content, schema); results.push({ id: item.id, result }); } return results; } /** * Create a schema from example data */ async inferSchema( exampleData: string, schemaName = 'InferredSchema' ): Promise<DataSchema> { const response = await this.llm.chat({ messages: [ { role: 'system', content: `Analyze the data and infer a schema for extraction. Return JSON: { "name": "schema_name", "fields": [{"name": "field_name", "type": "string|number|boolean|date|array|object", "description": "field description", "required": true|false, "example": "example value"}] }`, }, { role: 'user', content: `Infer a schema from this example data:\n\n${exampleData.slice(0, 4000)}`, }, ], maxTokens: 1500, temperature: 0.2, }); try { const parsed = JSON.parse(response.content); return { name: parsed.name || schemaName, fields: parsed.fields || [], }; } catch { return { name: schemaName, fields: [], }; } } /** * Validate extracted data against rules */ async validate( data: Record<string, unknown>, rules: Array<{ field: string; rule: string }> ): Promise<{ valid: boolean; errors: Array<{ field: string; error: string }>; }> { const rulesText = rules.map((r) => `- ${r.field}: ${r.rule}`).join('\n'); const response = await this.llm.chat({ messages: [ { role: 'system', content: `Validate the data against the rules. Return JSON: {"valid": true|false, "errors": [{"field": "field_name", "error": "error description"}]}`, }, { role: 'user', content: `Data:\n${JSON.stringify(data, null, 2)}\n\nRules:\n${rulesText}`, }, ], maxTokens: 500, temperature: 0.1, }); try { return JSON.parse(response.content); } catch { return { valid: true, errors: [] }; } } } export function createDataExtractor(llm: AgentLLM) { return new DataExtractor(llm); }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/babasida246/ai-mcp-gateway'

If you have feedback or need assistance with the MCP directory API, please join our Discord server