Skip to main content
Glama
csv.ts (6.4 kB)
import { SupportedFileType } from '@superglue/shared';
import Papa from 'papaparse';
import { Readable } from 'stream';
import { DetectionPriority, FileParsingStrategy } from '../strategy.js';

/**
 * File parsing strategy for delimiter-separated text.
 *
 * Detection is delegated to {@link isLikelyCSV} and parsing to {@link parseCSV}.
 */
export class CSVStrategy implements FileParsingStrategy {
  readonly fileType = SupportedFileType.CSV;
  readonly priority = DetectionPriority.HEURISTIC_TEXT;

  /** Returns true when the start of `buffer` looks like delimiter-separated text. */
  canHandle(buffer: Buffer): boolean {
    return isLikelyCSV(buffer);
  }

  /** Parses `buffer` as CSV; see {@link parseCSV} for the shape of the result. */
  async parse(buffer: Buffer): Promise<any> {
    return parseCSV(buffer);
  }
}

/**
 * Heuristically decides whether a buffer holds delimiter-separated text.
 *
 * Looks at the first 8192 bytes, takes up to 10 non-empty lines and, for each
 * candidate delimiter (`,`, tab, `;`, `|`), checks that
 *  - at least 70% of the lines contain the delimiter, and
 *  - at least 80% of the lines either contain no delimiter at all or have a
 *    count within `max(2, 30% of the average)` of the average count.
 *
 * @param buffer Raw file contents.
 * @returns `true` if any candidate delimiter satisfies both checks; `false`
 *          otherwise, including when fewer than two non-empty lines are present.
 */
function isLikelyCSV(buffer: Buffer): boolean {
  const sampleSize = Math.min(buffer.length, 8192);
  const sample = buffer.subarray(0, sampleSize).toString('utf8');
  const lines = sample
    .split(/\r?\n/)
    .filter(line => line.length > 0)
    .slice(0, 10);

  if (lines.length < 2) return false;

  const delimiters = [',', '\t', ';', '|'];

  for (const delimiter of delimiters) {
    // Splitting on a single-character delimiter yields (occurrences + 1) parts,
    // which avoids building a fresh RegExp for every line.
    const delimiterCounts = lines.map(line => line.split(delimiter).length - 1);
    const nonZeroCounts = delimiterCounts.filter(count => count > 0);

    // Delimiter absent from the sample, or present on too few lines.
    if (nonZeroCounts.length === 0) continue;
    if (nonZeroCounts.length < lines.length * 0.7) continue;

    const avgCount = nonZeroCounts.reduce((a, b) => a + b, 0) / nonZeroCounts.length;
    const tolerance = Math.max(2, avgCount * 0.3);
    const consistentLines = delimiterCounts.filter(
      count => count === 0 || Math.abs(count - avgCount) <= tolerance
    ).length;

    if (consistentLines >= lines.length * 0.8) {
      return true;
    }
  }

  return false;
}

/**
 * Locates the header row inside a sample of the file.
 *
 * Parses up to 100 rows of `sample` with the given delimiter and picks the
 * first row that has the highest number of fields. Header cells are trimmed;
 * empty cells are replaced by `Column <n>` (1-based).
 *
 * If the parser yields no rows at all, an empty header at index 0 is returned
 * instead of throwing inside the parser callback (which would leave the
 * promise pending forever).
 *
 * @param sample    Leading portion of the file.
 * @param delimiter Field delimiter to parse with; echoed back in the result.
 * @returns The header cell values, the zero-based index of the header row and
 *          the delimiter that was used.
 */
function detectCSVHeaders(sample: Buffer, delimiter: string): Promise<{ headerValues: string[], headerRowIndex: number, delimiter: string }> {
  return new Promise<{ headerValues: string[], headerRowIndex: number, delimiter: string }>((resolve, reject) => {
    Papa.parse(Readable.from(sample), {
      preview: 100,
      header: false,
      skipEmptyLines: false,
      delimiter: delimiter,
      complete: (result) => {
        // Anything thrown here would escape the parser callback rather than
        // reject the promise, so convert failures into rejections explicitly.
        try {
          const rows = result.data as string[][];
          if (rows.length === 0) {
            resolve({ headerValues: [], headerRowIndex: 0, delimiter });
            return;
          }

          // First row with the strictly largest field count wins.
          let headerRowIndex = 0;
          let headerRow = rows[0];
          for (let i = 1; i < rows.length; i++) {
            if (rows[i].length > headerRow.length) {
              headerRowIndex = i;
              headerRow = rows[i];
            }
          }

          const headerValues = headerRow.map(
            (value: string, index: number) => value?.trim() || `Column ${index + 1}`
          );
          resolve({ headerValues, headerRowIndex, delimiter });
        } catch (error) {
          reject(error);
        }
      },
      error: (error) => reject(error)
    });
  });
}

/**
 * Guesses the field delimiter of a CSV buffer.
 *
 * Counts occurrences of each candidate (`,`, `|`, tab, `;`, `:`) outside of
 * double-quoted sections within the first 32768 bytes and returns the most
 * frequent one. Ties go to the candidate listed first.
 *
 * @param buffer Raw file contents (or a leading sample of them).
 * @returns The detected delimiter, or `','` when no candidate occurs.
 */
function detectDelimiter(buffer: Buffer): string {
  const sampleSize = Math.min(buffer.length, 32768);
  const sample = buffer.subarray(0, sampleSize).toString('utf8');
  const delimiters = [',', '|', '\t', ';', ':'];

  const counts = delimiters.map(delimiter => ({
    delimiter,
    count: countUnescapedDelimiter(sample, delimiter)
  }));

  // Strict `>` keeps the earlier candidate on ties.
  const detectedDelimiter = counts.reduce((prev, curr) => (curr.count > prev.count ? curr : prev));

  if (detectedDelimiter.count === 0) {
    return ',';
  }
  return detectedDelimiter.delimiter;
}

/**
 * Counts occurrences of `delimiter` in `text` that are not inside a
 * double-quoted section.
 *
 * A `"` toggles the quoted state unless the preceding character is a
 * backslash.
 *
 * @param text      Text to scan.
 * @param delimiter Delimiter string to look for.
 * @returns Number of positions outside quotes at which `delimiter` starts.
 */
function countUnescapedDelimiter(text: string, delimiter: string): number {
  let count = 0;
  let inQuotes = false;
  let prevChar = '';

  for (let i = 0; i < text.length; i++) {
    const currentChar = text.charAt(i);

    if (currentChar === '"' && prevChar !== '\\') {
      inQuotes = !inQuotes;
    } else if (!inQuotes && text.startsWith(delimiter, i)) {
      count++;
    }

    prevChar = currentChar;
  }

  return count;
}

/**
 * Parses a CSV buffer into row objects keyed by the detected header.
 *
 * The delimiter and header row are detected from the first 32768 bytes. Rows
 * before the header row are collected as metadata (rows without any truthy
 * cell are skipped); rows after it become objects keyed by the header values.
 * Data rows whose cells are all empty after trimming are skipped.
 *
 * @param buffer Raw file contents.
 * @returns A promise resolving to
 *  - `{ data, metadata }` when at least one non-empty row precedes the header,
 *  - otherwise the array of row objects when there is at least one data row,
 *  - otherwise the raw header as an index-keyed object (or `[]` when the
 *    header row was never reached).
 *
 * The promise rejects when the parser reports an error or when processing a
 * row throws; in the latter case parsing is aborted rather than resolving
 * with silently truncated results.
 */
export async function parseCSV(buffer: Buffer): Promise<any> {
  const results: any[] = [];
  const metadata: any[] = [];

  const sampleSize = Math.min(buffer.length, 32768);
  const sample = buffer.subarray(0, sampleSize);
  const delimiter = detectDelimiter(sample);
  const { headerValues, headerRowIndex } = await detectCSVHeaders(sample, delimiter);

  // Stays `[]` until the header row is seen, then becomes an index-keyed object.
  let rawHeader: Record<string, any> | any[] = [];
  let currentLine = -1;

  return new Promise((resolve, reject) => {
    Papa.parse(Readable.from(buffer), {
      header: false,
      skipEmptyLines: false,
      delimiter: delimiter,
      step: (result: { data: any[] }, parser) => {
        try {
          currentLine++;
          const row = result.data;

          if (currentLine === headerRowIndex) {
            rawHeader = row
              .filter(Boolean)
              .reduce<Record<string, any>>((acc, value, index) => {
                acc[`${index}`] = value;
                return acc;
              }, {});
            return;
          }

          if (currentLine < headerRowIndex) {
            // Rows above the header: keep only those with some content.
            if (row == null || row.filter(Boolean).length === 0) return;
            metadata.push(row);
            return;
          }

          // Skip data rows that are blank once whitespace is trimmed.
          if (row == null || row.map((value: any) => value?.trim()).filter(Boolean).length === 0) return;

          const dataObject: { [key: string]: any } = {};
          for (let i = 0; i < headerValues.length; i++) {
            dataObject[headerValues[i]] = row[i];
          }
          results.push(dataObject);
        } catch (error) {
          // Reject first: aborting triggers `complete`, whose resolve() then
          // becomes a no-op instead of reporting partial data as success.
          reject(error);
          parser.abort();
        }
      },
      complete: () => {
        if (metadata.length > 0) {
          resolve({ data: results, metadata });
        } else if (results.length > 0) {
          resolve(results);
        } else {
          resolve(rawHeader);
        }
      },
      error: (error) => {
        reject(error);
      },
    });
  });
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/superglue-ai/superglue'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.