// csv.ts (6.4 kB)
import { SupportedFileType } from '@superglue/shared';
import Papa from 'papaparse';
import { Readable } from 'stream';
import { DetectionPriority, FileParsingStrategy } from '../strategy.js';
/**
 * File-parsing strategy for delimiter-separated text.
 *
 * Detection is delegated to `isLikelyCSV` and parsing to `parseCSV`; the class
 * itself only declares the file type and detection priority it registers with.
 */
export class CSVStrategy implements FileParsingStrategy {
  readonly fileType = SupportedFileType.CSV;
  readonly priority = DetectionPriority.HEURISTIC_TEXT;

  /** Reports whether the start of `buffer` looks like delimited text. */
  canHandle(buffer: Buffer): boolean {
    const looksDelimited = isLikelyCSV(buffer);
    return looksDelimited;
  }

  /** Parses `buffer` as CSV and resolves with whatever `parseCSV` produces. */
  async parse(buffer: Buffer): Promise<any> {
    const parsed = await parseCSV(buffer);
    return parsed;
  }
}
/**
 * Heuristically decides whether a buffer contains delimiter-separated text.
 *
 * Only the first 8192 bytes are decoded, and only the first 10 non-empty lines
 * of that sample are inspected. For each candidate delimiter (comma, tab,
 * semicolon, pipe) the buffer is accepted when:
 *   - at least 70% of the lines contain the delimiter, and
 *   - at least 80% of the lines either lack it entirely or have a count within
 *     max(2, 30% of the mean) of the mean count over the lines that contain it.
 *
 * @param buffer Raw file content.
 * @returns `true` if any candidate delimiter passes both thresholds; `false`
 *          otherwise, including when fewer than two non-empty lines exist.
 */
function isLikelyCSV(buffer: Buffer): boolean {
  const head = buffer.subarray(0, Math.min(buffer.length, 8192)).toString('utf8');

  // Collect up to ten non-empty lines from the sample.
  const rows: string[] = [];
  for (const row of head.split(/\r?\n/)) {
    if (row.length === 0) continue;
    rows.push(row);
    if (rows.length === 10) break;
  }
  if (rows.length < 2) return false;

  for (const candidate of [',', '\t', ';', '|']) {
    // Splitting on a single character yields (occurrences + 1) pieces.
    const perRow = rows.map(row => row.split(candidate).length - 1);
    const populated = perRow.filter(n => n > 0);

    // Candidate never appears: try the next one.
    if (populated.length === 0) continue;
    // Candidate appears on too few lines to be the column separator.
    if (populated.length < rows.length * 0.7) continue;

    const mean = populated.reduce((sum, n) => sum + n, 0) / populated.length;
    const tolerance = Math.max(2, mean * 0.3);
    const consistent = perRow.filter(
      n => n === 0 || Math.abs(n - mean) <= tolerance
    ).length;

    if (consistent >= rows.length * 0.8) {
      return true;
    }
  }

  return false;
}
/**
 * Locates the header row in a CSV sample and returns its cleaned-up values.
 *
 * At most the first 100 rows of `sample` are parsed with the given delimiter.
 * The header is taken to be the first row with the greatest number of fields.
 * Each header value is trimmed; blank values are replaced by `Column N`
 * (1-based position).
 *
 * @param sample    Leading bytes of the CSV content.
 * @param delimiter Field delimiter to parse with; echoed back in the result.
 * @returns The header values, the zero-based index of the header row, and the
 *          delimiter. If the sample yields no rows, resolves with an empty
 *          header list and index 0.
 * @throws Rejects with the error reported by Papa, or with any exception
 *         raised while inspecting the parsed rows.
 */
async function detectCSVHeaders(sample: Buffer, delimiter: string): Promise<{ headerValues: string[], headerRowIndex: number, delimiter: string }> {
  return new Promise<{ headerValues: string[], headerRowIndex: number, delimiter: string }>((resolve, reject) => {
    Papa.parse(Readable.from(sample), {
      preview: 100,
      header: false,
      skipEmptyLines: false,
      delimiter: delimiter,
      complete: (result) => {
        // An exception thrown inside this callback would otherwise leave the
        // promise pending forever, so route it to reject().
        try {
          const rows = (result.data ?? []) as string[][];

          // No rows at all (e.g. empty input): there is no header to index into.
          if (rows.length === 0) {
            resolve({ headerValues: [], headerRowIndex: 0, delimiter });
            return;
          }

          // Pick the first row with the strictly greatest field count.
          const headerRowIndex = rows.reduce<number>(
            (maxIndex, row, currentIndex) =>
              row.length > rows[maxIndex].length ? currentIndex : maxIndex,
            0
          );

          const headerValues = rows[headerRowIndex]
            .map((value: string, index: number) => value?.trim() || `Column ${index + 1}`);

          resolve({ headerValues, headerRowIndex, delimiter });
        } catch (error) {
          reject(error);
        }
      },
      error: (error) => reject(error)
    });
  });
}
/**
 * Guesses the field delimiter of CSV content.
 *
 * Decodes up to the first 32768 bytes and counts, for each candidate
 * (comma, pipe, tab, semicolon, colon — in that order), the occurrences found
 * outside double-quoted sections. The candidate with the highest count wins;
 * on a tie the earlier candidate is kept.
 *
 * @param buffer Raw CSV content (or a leading sample of it).
 * @returns The winning delimiter, or `','` when no candidate occurs at all.
 */
function detectDelimiter(buffer: Buffer): string {
  const text = buffer.subarray(0, Math.min(buffer.length, 32768)).toString('utf8');

  let best = ',';
  let bestCount = 0;
  for (const candidate of [',', '|', '\t', ';', ':']) {
    const occurrences = countUnescapedDelimiter(text, candidate);
    // Strict comparison keeps the earliest candidate on ties.
    if (occurrences > bestCount) {
      best = candidate;
      bestCount = occurrences;
    }
  }

  return bestCount === 0 ? ',' : best;
}
/**
 * Counts occurrences of `delimiter` in `text` that lie outside double quotes.
 *
 * A `"` toggles the in-quotes state unless the character immediately before it
 * is a backslash. Matches are tested at every position, so occurrences of a
 * multi-character delimiter may overlap.
 *
 * @param text      Text to scan.
 * @param delimiter Delimiter string to look for.
 * @returns Number of positions outside quotes at which `delimiter` starts.
 */
function countUnescapedDelimiter(text: string, delimiter: string): number {
  let total = 0;
  let quoted = false;
  let previous = '';

  for (let pos = 0; pos < text.length; pos++) {
    const ch = text[pos];
    const togglesQuote = ch === '"' && previous !== '\\';

    if (togglesQuote) {
      quoted = !quoted;
    } else if (!quoted && text.startsWith(delimiter, pos)) {
      total += 1;
    }

    previous = ch;
  }

  return total;
}
/**
 * Parses CSV content into row objects keyed by the detected header names.
 *
 * The delimiter and header row are detected from the first 32768 bytes; the
 * whole buffer is then streamed through Papa row by row:
 *   - rows before the header row that contain at least one non-empty cell are
 *     collected as metadata;
 *   - the header row itself is recorded (non-empty cells only, keyed by their
 *     position after filtering) and not emitted as data;
 *   - later rows whose cells are all blank after trimming are skipped; every
 *     other row becomes an object mapping each header value to the cell at the
 *     same position.
 *
 * @param buffer Raw CSV content.
 * @returns `{ data, metadata }` when metadata rows were found; otherwise the
 *          array of row objects when there is at least one; otherwise the
 *          recorded header object (or `[]` if the header row was never reached).
 * @throws Rejects if header detection fails, if Papa reports an error, or if
 *         processing a row throws.
 */
export async function parseCSV(buffer: Buffer): Promise<any> {
  const results: any[] = [];
  const metadata: any[] = [];

  const sampleSize = Math.min(buffer.length, 32768);
  const sample = buffer.subarray(0, sampleSize);
  const delimiter = detectDelimiter(sample);
  const { headerValues, headerRowIndex } = await detectCSVHeaders(sample, delimiter);

  // Starts as an empty array and becomes an index-keyed object once the
  // header row has been seen.
  let rawHeader: Record<string, any> | any[] = [];
  let currentLine = -1;

  return new Promise((resolve, reject) => {
    Papa.parse(Readable.from(buffer), {
      header: false,
      skipEmptyLines: false,
      delimiter: delimiter,
      step: (result: { data: any[] }, parser) => {
        try {
          currentLine++;
          const row = result.data;

          if (currentLine === headerRowIndex) {
            rawHeader = row.filter(Boolean).reduce<Record<string, any>>((acc, value, index) => {
              acc[`${index}`] = value;
              return acc;
            }, {});
            return;
          }

          if (currentLine < headerRowIndex) {
            // Rows above the header are kept as metadata unless entirely empty.
            if (row == null || row.filter(Boolean).length === 0) return;
            metadata.push(row);
            return;
          }

          // Skip data rows in which every cell is blank after trimming.
          if (row == null || row.map((value: any) => value?.trim()).filter(Boolean).length === 0) return;

          const dataObject: { [key: string]: any } = {};
          for (let i = 0; i < headerValues.length; i++) {
            dataObject[headerValues[i]] = row[i];
          }
          results.push(dataObject);
        } catch (error) {
          // Surface the failure instead of resolving with silently truncated
          // data. Rejecting first makes any later resolve() from `complete`
          // (triggered by abort) a no-op.
          reject(error);
          parser.abort();
        }
      },
      complete: () => {
        if (metadata.length > 0) {
          resolve({
            data: results,
            metadata
          });
          return;
        }
        resolve(results.length > 0 ? results : rawHeader);
      },
      error: (error) => {
        reject(error);
      },
    });
  });
}