/**
* TOON (Token-Oriented Object Notation) Multi-Format Encoder
*
* Converts various data formats to TOON for reduced token usage with LLMs.
* Supports: JSON, CSV/TSV, XML, YAML, HTML tables, and already-parsed objects/arrays (e.g. SQL result rows)
*/
class ToonEncoder {
  /**
   * Encode `input` as TOON, auto-detecting its format unless `options.format` is set.
   *
   * @param {string|object} input - Raw text in a supported format, or already-parsed data.
   * @param {{format?: string}} [options] - `format` skips detection when provided.
   * @returns {string} TOON text.
   * @throws {SyntaxError} If `options.format` is 'json' and `input` is not valid JSON.
   */
  static encode(input, options = {}) {
    const format = options.format || this.detectFormat(input);
    const data = this.parse(input, format);
    return this.toToon(data, options);
  }

  /**
   * Guess the format of `input` with cheap heuristics.
   *
   * @param {*} input
   * @returns {'object'|'json'|'html-table'|'xml'|'yaml'|'csv'|'tsv'|'text'}
   */
  static detectFormat(input) {
    if (typeof input !== 'string') return 'object';
    const trimmed = input.trim();

    // JSON: must both look like JSON and actually parse.
    const looksLikeJson =
      (trimmed.startsWith('{') && trimmed.endsWith('}')) ||
      (trimmed.startsWith('[') && trimmed.endsWith(']'));
    if (looksLikeJson) {
      try {
        JSON.parse(trimmed);
        return 'json';
      } catch {
        // Deliberately ignored: not JSON after all, keep trying other formats.
      }
    }

    // XML/HTML. The tag name must end right after "table"/"tr" so that XML
    // tags such as <transaction> or <track> are not mistaken for table rows.
    if (trimmed.startsWith('<') && trimmed.endsWith('>')) {
      if (/<(?:table|tr)[\s>]/i.test(trimmed)) return 'html-table';
      return 'xml';
    }

    // YAML (basic detection)
    if (trimmed.includes(':\n') || /^[\w-]+:\s/m.test(trimmed)) {
      return 'yaml';
    }

    // CSV: every non-blank line must have the same number of fields.
    // Fields are counted quote-aware so `"a, b"` is a single field.
    if (trimmed.includes(',') && trimmed.includes('\n')) {
      const lines = trimmed.split('\n');
      const columns = this.parseCsvLine(lines[0], ',').length;
      const isRectangular = lines.every(
        (l) => l.trim() === '' || this.parseCsvLine(l, ',').length === columns
      );
      if (isRectangular) return 'csv';
    }

    if (trimmed.includes('\t') && trimmed.includes('\n')) {
      return 'tsv';
    }
    return 'text';
  }

  /**
   * Parse `input` into plain data according to `format`.
   * Non-string input is returned untouched; unknown formats are wrapped as `{ _raw }`.
   *
   * @param {*} input
   * @param {string} format - One of the values returned by `detectFormat`.
   * @returns {*}
   */
  static parse(input, format) {
    if (typeof input !== 'string') return input;
    switch (format) {
      case 'json':
        return JSON.parse(input);
      case 'csv':
        return this.parseCsv(input, ',');
      case 'tsv':
        return this.parseCsv(input, '\t');
      case 'xml':
        return this.parseXml(input);
      case 'html-table':
        return this.parseHtmlTable(input);
      case 'yaml':
        return this.parseYaml(input);
      case 'object':
        return input;
      default:
        return { _raw: input };
    }
  }

  /**
   * Parse CSV/TSV text into an array of objects keyed by the first line's headers.
   * Returns `[]` when there is no data line after the header.
   *
   * @param {string} input
   * @param {string} [delimiter=',']
   * @returns {object[]}
   */
  static parseCsv(input, delimiter = ',') {
    const lines = input.trim().split('\n').filter((l) => l.trim());
    if (lines.length < 2) return [];
    const headers = this.parseCsvLine(lines[0], delimiter);
    return lines.slice(1).map((line) => {
      const values = this.parseCsvLine(line, delimiter);
      const obj = {};
      headers.forEach((h, i) => {
        obj[h.trim()] = this.parseValue(values[i]?.trim() || '');
      });
      return obj;
    });
  }

  /**
   * Split one delimited record into raw field strings.
   * Delimiters inside double quotes are literal, and a doubled quote (`""`)
   * inside a quoted section yields one literal `"`.
   *
   * @param {string} line
   * @param {string} delimiter - Single-character delimiter.
   * @returns {string[]}
   */
  static parseCsvLine(line, delimiter) {
    const values = [];
    let current = '';
    let inQuotes = false;
    for (let i = 0; i < line.length; i++) {
      const char = line[i];
      if (char === '"') {
        if (inQuotes && line[i + 1] === '"') {
          // Escaped quote inside a quoted field: keep one and skip its twin.
          current += '"';
          i++;
        } else {
          inQuotes = !inQuotes;
        }
      } else if (char === delimiter && !inQuotes) {
        values.push(current);
        current = '';
      } else {
        current += char;
      }
    }
    values.push(current);
    return values;
  }

  /**
   * Parse simple XML into an array of flat objects (one per row element).
   *
   * Top-level elements are first treated as rows and their attribute-less leaf
   * elements as fields. When that produces at most one row, the input is most
   * likely a root wrapper around repeated records, so wrappers are descended
   * until a level with several record elements is found.
   *
   * @param {string} input
   * @returns {object[]|{_xml: string}} Rows, or `{ _xml: input }` when nothing was extracted.
   */
  static parseXml(input) {
    let items = this.xmlRowsToItems(input.match(/<(\w+)>[\s\S]*?<\/\1>/g) || []);
    if (items.length <= 1) {
      const records = this.xmlRowsToItems(this.findXmlRecords(input));
      if (records.length > 1) items = records;
    }
    return items.length > 0 ? items : { _xml: input };
  }

  /**
   * Turn XML row fragments into objects built from their `<tag>text</tag>` leaves.
   * Rows without any such leaf are dropped.
   *
   * @param {string[]} rows
   * @returns {object[]}
   */
  static xmlRowsToItems(rows) {
    const items = [];
    for (const row of rows) {
      const obj = {};
      for (const [, tag, text] of row.matchAll(/<(\w+)>([^<]*)<\/\1>/g)) {
        obj[tag] = this.parseValue(text);
      }
      if (Object.keys(obj).length > 0) items.push(obj);
    }
    return items;
  }

  /**
   * Find repeated record elements hidden behind one or more wrapper elements.
   * Descends while a level holds exactly one element that has child elements.
   *
   * @param {string} input
   * @returns {string[]} Two or more record fragments, or `[]` if none were found.
   */
  static findXmlRecords(input) {
    // Elements (attributes allowed) that contain markup rather than plain text.
    const containersOf = (xml) =>
      (xml.match(/<(\w+)(?:\s[^>]*)?>[\s\S]*?<\/\1>/g) || []).filter(
        (el) => !/^<[^>]*>[^<]*<\/\w+>$/.test(el)
      );

    let containers = containersOf(input);
    while (containers.length === 1) {
      // Strip the wrapper's own tags; the scope shrinks every pass, so this terminates.
      const inner = containers[0].replace(/^<[^>]*>/, '').replace(/<\/\w+>$/, '');
      containers = containersOf(inner);
    }
    return containers;
  }

  /**
   * Parse an HTML table into an array of objects.
   *
   * Headers come from every `<th>` in the input; when there are none, the first
   * row of `<td>` cells is used as the header row. Markup nested inside a cell
   * is stripped, so each cell keeps its column position.
   *
   * @param {string} input
   * @returns {object[]}
   */
  static parseHtmlTable(input) {
    const cellTexts = (html, tag) => {
      // \b keeps <th> from matching <thead> and <td> from matching longer names.
      const cellRe = new RegExp(`<${tag}\\b[^>]*>([\\s\\S]*?)<\\/${tag}>`, 'gi');
      return [...html.matchAll(cellRe)].map((m) => m[1].replace(/<[^>]+>/g, '').trim());
    };

    const rows = (input.match(/<tr\b[^>]*>[\s\S]*?<\/tr>/gi) || [])
      .map((row) => cellTexts(row, 'td'))
      .filter((cells) => cells.length > 0);

    let headers = cellTexts(input, 'th');
    if (headers.length === 0 && rows.length > 0) {
      headers = rows.shift();
    }

    const items = [];
    for (const cells of rows) {
      const obj = {};
      cells.forEach((value, i) => {
        if (headers[i]) {
          obj[headers[i]] = this.parseValue(value);
        }
      });
      if (Object.keys(obj).length > 0) {
        items.push(obj);
      }
    }
    return items;
  }

  /**
   * Parse a simple YAML list of flat mappings (`- key: value` items).
   * Falls back to `parseYamlSimple` when no list item with keys is found.
   *
   * @param {string} input
   * @returns {object[]|object}
   */
  static parseYaml(input) {
    const result = [];
    let currentObj = null;
    const flush = () => {
      if (currentObj && Object.keys(currentObj).length > 0) {
        result.push(currentObj);
      }
    };

    for (const line of input.trim().split('\n')) {
      const trimmed = line.trim();
      if (!trimmed || trimmed.startsWith('#')) continue;
      if (trimmed.startsWith('- ')) {
        // A new list item starts a new object.
        flush();
        currentObj = {};
        const content = trimmed.slice(2);
        if (content.includes(':')) {
          this.assignYamlPair(currentObj, content);
        }
      } else if (currentObj && trimmed.includes(':')) {
        // Continuation key of the current list item; keys before the first item are ignored.
        this.assignYamlPair(currentObj, trimmed);
      }
    }
    flush();
    return result.length > 0 ? result : this.parseYamlSimple(input);
  }

  /**
   * Parse flat `key: value` YAML lines into a single object.
   *
   * @param {string} input
   * @returns {object}
   */
  static parseYamlSimple(input) {
    const obj = {};
    for (const line of input.trim().split('\n')) {
      const trimmed = line.trim();
      if (!trimmed || trimmed.startsWith('#')) continue;
      if (trimmed.includes(':')) {
        this.assignYamlPair(obj, trimmed);
      }
    }
    return obj;
  }

  /**
   * Split `text` at its first colon and store the typed value on `target`.
   * Later colons stay in the value, so `url: http://x` works.
   *
   * @param {object} target - Object that receives the key.
   * @param {string} text - A `key: value` string containing at least one colon.
   */
  static assignYamlPair(target, text) {
    const colon = text.indexOf(':');
    const key = text.slice(0, colon).trim();
    target[key] = this.parseValue(text.slice(colon + 1).trim());
  }

  /**
   * Convert a raw string to a boolean, null, number, or leave it as a string.
   * Empty, null and undefined inputs become `''`; whitespace-only strings are
   * returned unchanged instead of being coerced to 0.
   *
   * @param {*} str
   * @returns {string|number|boolean|null}
   */
  static parseValue(str) {
    if (str === '' || str === null || str === undefined) return '';
    if (str === 'true') return true;
    if (str === 'false') return false;
    if (str === 'null' || str === 'nil') return null;
    if (typeof str === 'string' && str.trim() === '') return str;
    const num = Number(str);
    return Number.isNaN(num) ? str : num;
  }

  /**
   * Convert parsed data to TOON text.
   * Arrays of uniform, flat objects use the tabular form; other objects and
   * arrays use the nested form; primitives are stringified.
   *
   * @param {*} data
   * @param {object} [options] - Currently unused; accepted for API stability.
   * @returns {string}
   */
  static toToon(data, options = {}) {
    if (Array.isArray(data) && data.length > 0 && this.isTabularArray(data)) {
      return this.toToonTabular(data);
    }
    if (typeof data === 'object' && data !== null) {
      return this.toToonNested(data);
    }
    return String(data);
  }

  /**
   * Check whether every element is a non-null object with the same key set
   * as the first element (key order does not matter).
   *
   * @param {Array} arr
   * @returns {boolean}
   */
  static isUniformArray(arr) {
    if (!arr.length || typeof arr[0] !== 'object' || arr[0] === null) return false;
    const keys = Object.keys(arr[0]).sort().join(',');
    return arr.every(
      (item) =>
        typeof item === 'object' &&
        item !== null &&
        Object.keys(item).sort().join(',') === keys
    );
  }

  /**
   * Check whether `arr` can be written as a table: uniform objects whose
   * values are all primitives. Nested objects/arrays cannot fit in a cell.
   *
   * @param {Array} arr
   * @returns {boolean}
   */
  static isTabularArray(arr) {
    return (
      this.isUniformArray(arr) &&
      arr.every((item) =>
        Object.values(item).every((v) => v === null || typeof v !== 'object')
      )
    );
  }

  /**
   * Render a uniform array as tabular TOON: a `[k1,k2,...]` header line
   * followed by one comma-separated line per item.
   * null/undefined become empty cells; strings containing a comma, newline or
   * double quote are wrapped in quotes with inner quotes doubled.
   *
   * @param {object[]} data - Non-empty array; column order comes from the first item.
   * @returns {string}
   */
  static toToonTabular(data) {
    const keys = Object.keys(data[0]);
    const header = `[${keys.join(',')}]`;
    const formatCell = (val) => {
      if (val === null || val === undefined) return '';
      if (typeof val === 'boolean') return val ? 'true' : 'false';
      if (typeof val === 'string' && (val.includes(',') || val.includes('\n') || val.includes('"'))) {
        return `"${val.replace(/"/g, '""')}"`;
      }
      return String(val);
    };
    const rows = data.map((item) => keys.map((key) => formatCell(item[key])).join(','));
    return `${header}\n${rows.join('\n')}`;
  }

  /**
   * Render an object (or array, via its index keys) as indented `key: value` lines.
   * Tabular-capable arrays are embedded as a table under their key; other
   * arrays become `- item` lines.
   *
   * @param {object} data
   * @param {number} [indent=0] - Nesting depth; one space of prefix per level.
   * @returns {string}
   */
  static toToonNested(data, indent = 0) {
    const prefix = ' '.repeat(indent);
    const lines = [];
    for (const [key, value] of Object.entries(data)) {
      if (Array.isArray(value)) {
        lines.push(`${prefix}${key}:`);
        if (this.isTabularArray(value)) {
          for (const line of this.toToonTabular(value).split('\n')) {
            lines.push(`${prefix} ${line}`);
          }
        } else {
          for (const item of value) {
            if (typeof item === 'object' && item !== null) {
              lines.push(`${prefix}- ${this.toToonNested(item, indent + 1).trim()}`);
            } else {
              lines.push(`${prefix}- ${item}`);
            }
          }
        }
      } else if (typeof value === 'object' && value !== null) {
        lines.push(`${prefix}${key}:`);
        lines.push(this.toToonNested(value, indent + 1));
      } else {
        const val = typeof value === 'boolean' ? (value ? 'true' : 'false') : value;
        lines.push(`${prefix}${key}: ${val}`);
      }
    }
    return lines.join('\n');
  }

  /**
   * Decode TOON text back to data.
   * Text whose first line is a `[...]` header is decoded as a table; anything
   * else goes through the simplified nested decoder. LF and CRLF are accepted.
   *
   * @param {string} toon
   * @returns {object[]|object}
   */
  static decode(toon) {
    const lines = toon.trim().split(/\r?\n/);
    // Check if tabular format
    if (lines[0].startsWith('[') && lines[0].endsWith(']')) {
      return this.decodeToonTabular(lines);
    }
    return this.decodeToonNested(lines);
  }

  /**
   * Decode tabular TOON lines (header first) into an array of objects.
   * Physical lines are re-joined while a double quote is still open, so
   * quoted cells containing newlines survive a round trip.
   *
   * @param {string[]} lines - `lines[0]` is the `[k1,k2,...]` header.
   * @returns {object[]}
   */
  static decodeToonTabular(lines) {
    const header = lines[0].slice(1, -1).split(',');

    // Rebuild logical records from physical lines.
    const records = [];
    let buffer = '';
    let quoteOpen = false;
    for (let i = 1; i < lines.length; i++) {
      const line = lines[i];
      if (!quoteOpen && !line.trim()) continue;
      buffer = quoteOpen ? `${buffer}\n${line}` : line;
      // An escaped quote ("") adds two, so parity only flips on real open/close quotes.
      const quoteCount = line.split('"').length - 1;
      if (quoteCount % 2 === 1) quoteOpen = !quoteOpen;
      if (!quoteOpen) records.push(buffer);
    }
    // Unterminated quote at end of input: keep what was collected (best effort).
    if (quoteOpen) records.push(buffer);

    return records.map((record) => {
      const values = this.parseCsvLine(record, ',');
      const obj = {};
      header.forEach((key, idx) => {
        obj[key] = this.parseValue(values[idx]?.trim() || '');
      });
      return obj;
    });
  }

  /**
   * Decode nested TOON (simplified).
   * Only unindented `key: value` lines are read. A key without an inline value
   * is set to `[]`, and the indented or `- ` lines beneath it are ignored.
   *
   * @param {string[]} lines
   * @returns {object}
   */
  static decodeToonNested(lines) {
    const result = {};
    for (const line of lines) {
      const match = line.match(/^(\w+):\s*(.*)$/);
      if (!match) continue;
      const [, key, value] = match;
      result[key] = value ? this.parseValue(value) : [];
    }
    return result;
  }

  /**
   * Estimate token count (~4 chars per token for English).
   *
   * @param {string} text
   * @returns {number}
   */
  static estimateTokens(text) {
    return Math.ceil(text.length / 4);
  }

  /**
   * Compare the size of the original text with its encoded form.
   *
   * @param {string} original
   * @param {string} encoded
   * @returns {{original: {chars: number, tokens: number},
   *            encoded: {chars: number, tokens: number},
   *            savings: {chars: number, tokens: number, percent: string}}}
   *   `percent` is relative to the original length and is '0.0%' for an empty original.
   */
  static getStats(original, encoded) {
    const origChars = original.length;
    const encChars = encoded.length;
    const origTokens = this.estimateTokens(original);
    const encTokens = this.estimateTokens(encoded);
    // Guard against 0/0 producing "NaN%" (or -Infinity%) for empty input.
    const ratio = origChars === 0 ? 0 : 1 - encChars / origChars;
    return {
      original: { chars: origChars, tokens: origTokens },
      encoded: { chars: encChars, tokens: encTokens },
      savings: {
        chars: origChars - encChars,
        tokens: origTokens - encTokens,
        percent: `${(ratio * 100).toFixed(1)}%`
      }
    };
  }
}
// CommonJS export: consumers use `const { ToonEncoder } = require(...)`.
module.exports = { ToonEncoder };