// Indexer service (T027)
// Pipeline:
// 1. list markdown files
// 2. read & parse frontmatter with gray-matter
// 3. validate required fields (using validation util)
// 4. generate chunks via chunkMarkdown
// 5. tokenize chunk text (cache tokens for performance)
// 6. attach freshness info, risk classification, and suggested severity
// Output: index object { documents: [...], chunks: [...], stats }
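//
// Example usage (sketch only): `createFsAdapter` and `loadConfig` are hypothetical
// placeholders for however the surrounding project builds an fsAdapter (exposing
// listMarkdown() and read(relPath)) and a config object.
//
//   import { createIndexer } from './indexer.mjs'; // adjust path to this file
//   const indexer = createIndexer({
//     fsAdapter: createFsAdapter({ rootDir: './docs' }), // hypothetical factory
//     config: loadConfig(),                              // hypothetical loader
//     logger: console                                    // anything with a log(event, data) method
//   });
//   const index = await indexer.buildIndex();
//   console.log(index.stats); // { files, documents, chunks, skipped, durationMs }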
import path from 'path';
import matter from 'gray-matter';
import { chunkMarkdown } from '../core/chunk.mjs';
import { tokenize } from '../core/tokenize.mjs';
import { evaluateFreshness } from '../core/freshness.mjs';
import { classifyRisk } from '../core/risk-classify.mjs';
import { suggestSeverity } from '../core/severity.mjs';
import { validateFrontmatter } from '../utils/validation.mjs';
import { ValidationError } from '../utils/errors.mjs';
export function createIndexer({ fsAdapter, config, logger }) {
  if (!fsAdapter) throw new ValidationError('fsAdapter required');
  if (!config) throw new ValidationError('config required');
  logger = logger || { log: () => {} };

  async function buildIndex() {
    const start = Date.now();
    const mdFiles = await fsAdapter.listMarkdown();
    logger.log('index.list', { count: mdFiles.length });
    const documents = [];
    const chunks = [];
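    // Token cache keyed by chunk text: identical chunks across documents are tokenized only once.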
    const tokenCache = new Map();
    let skipped = 0;
    for (const file of mdFiles) {
      const rel = file.relative;
      const { content } = await fsAdapter.read(rel);
      let fm;
      try {
        const parsed = matter(content);
        fm = parsed.data || {};
        validateFrontmatter(fm);
        const body = parsed.content || '';
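        // Normalize Windows path separators so document ids are stable across platforms.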
        const docId = rel.replace(/\\/g, '/');
        const title = fm.title || path.basename(rel, path.extname(rel));
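        // Freshness falls back through the `updated`, `date`, then `last_updated` frontmatter fields.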
        const freshness = evaluateFreshness(fm.updated || fm.date || fm.last_updated);
        // risk classification (combine frontmatter hints with body tokens)
        const { risks, safeOps } = classifyRisk(body, fm);
        const severity = suggestSeverity(risks.join(' '));
        const docRecord = { id: docId, relPath: rel, title, freshness, risks, safeOps, severity };
        documents.push(docRecord);
        logger.log('index.doc', { id: docId });
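        // Chunk the body; tokens for each chunk are reused from the cache when the text repeats.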
        const docChunks = chunkMarkdown(body).map((c, idx) => {
          const key = c.text;
          let tokens = tokenCache.get(key);
          if (!tokens) {
            tokens = tokenize(key);
            tokenCache.set(key, tokens);
          }
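          // Derive the leaf heading and nesting depth from the ' > '-joined heading path.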
          const headingParts = c.headingPath ? c.headingPath.split(' > ') : [];
          return {
            id: `${docId}::${idx}`,
            docId,
            heading: headingParts.length > 0 ? headingParts[headingParts.length - 1] : null,
            headingPath: c.headingPath || '',
            depth: headingParts.length,
            text: c.text,
            tokens
          };
        });
        chunks.push(...docChunks);
        logger.log('index.doc.chunks', { id: docId, chunkCount: docChunks.length });
      } catch (err) {
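        // Any parse/validation/chunking failure skips this file; the rest of the index still builds.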
        skipped++;
        logger.log('index.skip', { file: rel, error: err.message });
        continue;
      }
    }
    const durationMs = Date.now() - start;
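    // Shallow freeze: the wrapper object is immutable, but the nested arrays and records are not.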
    return Object.freeze({
      documents,
      chunks,
      stats: { files: mdFiles.length, documents: documents.length, chunks: chunks.length, skipped, durationMs }
    });
  }

  return Object.freeze({ buildIndex });
}
export default { createIndexer };