import {DocumentChunk, MarkdownDocument} from "./types.js";
import {TokenEstimator} from "./token-estimator.js";
export class BaaSDocument {
private chunks: DocumentChunk[] = [];
constructor(
private readonly keywordSet: Set<string>,
private readonly document: MarkdownDocument,
private readonly documentId: number
) {
this.generateChunks();
}
getId(): number {
return this.documentId;
}
getTitle(): string {
return this.document.metadata.title;
}
getDescription(): string {
return this.document.metadata.description;
}
getUrl(): string {
return this.document.url;
}
getContent(): string {
return this.document.content;
}
getKeywords(): Set<string> {
return this.keywordSet;
}
getMetadataKeywords(): string[] {
return this.document.metadata.keywords;
}
getChunks(): DocumentChunk[] {
return this.chunks;
}
private generateChunks(): void {
const MAX_TOKENS_PER_CHUNK = 2000;
const content = this.document.content;
const estimatedTokens = this.estimateTokens(content);
// 작은 문서는 단일 청크로 처리
if (estimatedTokens <= MAX_TOKENS_PER_CHUNK) {
this.chunks.push({
id: this.documentId,
chunkId: 0,
originTitle: this.document.metadata.title,
text: this.addContext(content, this.document.metadata.title),
rawText: content,
wordCount: this.countWords(content),
estimatedTokens: estimatedTokens,
headerStack: [this.document.metadata.title],
});
} else {
// 토큰 기반으로 문서를 청킹
const sections = this.splitIntoSections(content);
const chunks = this.splitByTokenLimit(sections, MAX_TOKENS_PER_CHUNK);
chunks.forEach((chunk, index) => {
this.chunks.push({
id: this.documentId,
chunkId: index,
originTitle: this.document.metadata.title,
text: this.addContext(chunk.text, chunk.header),
rawText: chunk.text,
wordCount: this.countWords(chunk.text),
estimatedTokens: this.estimateTokens(chunk.text),
headerStack: chunk.headerStack,
});
});
}
}
private splitIntoSections(content: string): { text: string; header: string; headerStack: string[] }[] {
const sections: { text: string; header: string; headerStack: string[] }[] = [];
const lines = content.split('\n');
let currentSection = '';
let currentHeader = this.document.metadata.title;
let headerStack: string[] = [this.document.metadata.title];
for (const line of lines) {
const headingMatch = line.match(/^(#{1,6})\s*(.+)$/);
if (headingMatch) {
// Save previous section
if (currentSection.trim()) {
sections.push({
text: currentSection.trim(),
header: currentHeader,
headerStack: [...headerStack]
});
}
// Start new section
const level = headingMatch[1].length;
const title = headingMatch[2].trim();
// Update header stack
headerStack = headerStack.slice(0, level);
headerStack[level - 1] = title;
currentHeader = title;
currentSection = line + '\n';
} else {
currentSection += line + '\n';
}
}
// Add last section
if (currentSection.trim()) {
sections.push({
text: currentSection.trim(),
header: currentHeader,
headerStack: [...headerStack]
});
}
return sections;
}
/**
* 섹션들을 토큰 제한에 맞춰 청크로 분할
*/
private splitByTokenLimit(
sections: { text: string; header: string; headerStack: string[] }[],
maxTokens: number
): { text: string; header: string; headerStack: string[] }[] {
const chunks: { text: string; header: string; headerStack: string[] }[] = [];
let currentChunk = '';
let currentTokens = 0;
let currentHeader = this.document.metadata.title;
let currentHeaderStack = [this.document.metadata.title];
for (const section of sections) {
// 섹션 텍스트 길이 확인 (100자 미만은 스킵)
if (section.text.trim().length < 100) continue;
const sectionTokens = this.estimateTokens(section.text);
// 단일 섹션이 토큰 제한을 초과하는 경우
if (sectionTokens > maxTokens) {
// 현재 청크가 있으면 저장
if (currentChunk.trim()) {
chunks.push({
text: currentChunk.trim(),
header: currentHeader,
headerStack: [...currentHeaderStack]
});
}
// 큰 섹션을 문단별로 분할
const paragraphs = this.splitSectionByParagraphs(section, maxTokens);
chunks.push(...paragraphs);
// 리셋
currentChunk = '';
currentTokens = 0;
continue;
}
// 현재 청크에 섹션 추가 가능한지 확인
if (currentTokens + sectionTokens > maxTokens && currentChunk.trim()) {
// 현재 청크 저장
chunks.push({
text: currentChunk.trim(),
header: currentHeader,
headerStack: [...currentHeaderStack]
});
// 새 청크 시작
currentChunk = section.text;
currentTokens = sectionTokens;
currentHeader = section.header;
currentHeaderStack = section.headerStack;
} else {
// 현재 청크에 추가
if (currentChunk) {
currentChunk += '\n\n' + section.text;
} else {
currentChunk = section.text;
currentHeader = section.header;
currentHeaderStack = section.headerStack;
}
currentTokens += sectionTokens;
}
}
// 마지막 청크 저장
if (currentChunk.trim()) {
chunks.push({
text: currentChunk.trim(),
header: currentHeader,
headerStack: [...currentHeaderStack]
});
}
return chunks;
}
/**
* 큰 섹션을 문단별로 분할
*/
private splitSectionByParagraphs(
section: { text: string; header: string; headerStack: string[] },
maxTokens: number
): { text: string; header: string; headerStack: string[] }[] {
const paragraphs = section.text.split('\n\n').filter(p => p.trim().length > 0);
const chunks: { text: string; header: string; headerStack: string[] }[] = [];
let currentChunk = '';
let currentTokens = 0;
for (const paragraph of paragraphs) {
const paragraphTokens = this.estimateTokens(paragraph);
if (currentTokens + paragraphTokens > maxTokens && currentChunk.trim()) {
chunks.push({
text: currentChunk.trim(),
header: section.header,
headerStack: [...section.headerStack]
});
currentChunk = paragraph;
currentTokens = paragraphTokens;
} else {
if (currentChunk) {
currentChunk += '\n\n' + paragraph;
} else {
currentChunk = paragraph;
}
currentTokens += paragraphTokens;
}
}
// 마지막 청크
if (currentChunk.trim()) {
chunks.push({
text: currentChunk.trim(),
header: section.header,
headerStack: [...section.headerStack]
});
}
return chunks;
}
private addContext(text: string, header: string): string {
return `# ${header}\n\n${text}`;
}
private countWords(text: string): number {
return text.split(/\s+/).filter(word => word.length > 0).length;
}
private estimateTokens(text: string): number {
return TokenEstimator.estimate(text);
}
hasKeyword(keyword: string): boolean {
if (!keyword || keyword.trim().length === 0) {
return false;
}
const normalizedKeyword = keyword.toLowerCase().trim();
return this.keywordSet.has(normalizedKeyword) ||
this.document.content.toLowerCase().includes(normalizedKeyword);
}
// Extract relevant chunks for the query
getRelevantChunks(queryTerms: string[], maxChunks: number = 3): string[] {
const paragraphs = this.document.content
.split('\n\n')
.filter(p => p.trim().length > 50);
const scoredParagraphs = paragraphs.map(paragraph => ({
paragraph,
score: this.scoreChunk(paragraph, queryTerms)
}))
.filter(item => item.score > 0)
.sort((a, b) => b.score - a.score)
.slice(0, maxChunks);
return scoredParagraphs.map(item => item.paragraph.trim());
}
private scoreChunk(chunk: string, queryTerms: string[]): number {
const chunkLower = chunk.toLowerCase();
let score = 0;
for (const term of queryTerms) {
const termLower = term.toLowerCase();
// Count occurrences
const regex = new RegExp(`\\b${termLower}\\b`, 'g');
const matches = chunkLower.match(regex);
const termCount = matches ? matches.length : 0;
// Boost score for title/heading context
if (chunk.includes('#')) {
score += termCount * 2;
} else {
score += termCount;
}
// Boost for code blocks
if (chunk.includes('```')) {
score += termCount * 1.5;
}
}
return score;
}
toJSON() {
return {
id: this.documentId,
title: this.getTitle(),
description: this.getDescription(),
url: this.getUrl(),
keywords: Array.from(this.keywordSet)
};
}
}