bm25.ts•3.77 kB
/**
 * BM25 알고리즘 구현
 * 한국어 문서 검색을 위한 확률적 랭킹 함수
 */
export interface DocumentChunk {
  id: number;
  chunkId: number;
  originTitle: string;
  text: string;
  wordCount: number;
}
export interface SearchResult {
  id: number;
  chunkId: number;
  score: number;
}
export class BM25Calculator {
  private readonly allChunks: DocumentChunk[];
  private readonly totalCount: number;
  private readonly averageDocLength: number;
  private readonly N: number;
  constructor(
    chunks: DocumentChunk[],
    private readonly k1: number = 1.2,
    private readonly b: number = 0.75
  ) {
    this.allChunks = chunks;
    this.totalCount = chunks.reduce((count, chunk) => count + chunk.wordCount, 0);
    this.averageDocLength = this.totalCount / chunks.length;
    this.N = chunks.length;
  }
  /**
   * 주어진 키워드로 문서 검색
   * @param keywords 검색할 키워드 (정규식 패턴)
   * @returns 점수가 높은 순으로 정렬된 검색 결과
   */
  calculate(keywords: string): SearchResult[] {
    const { termFrequencies, docFrequencies } = this.calculateFrequencies(keywords);
    const scores = this.calculateScore(termFrequencies, docFrequencies);
    // 점수 기준 내림차순 정렬
    scores.sort((a, b) =>
      b.score !== a.score ? b.score - a.score : b.totalTF - a.totalTF
    );
    return scores.map(({ id, score, chunkId }) => ({ id, chunkId, score }));
  }
  /**
   * 문서별 단어 빈도 계산
   */
  private calculateFrequencies(query: string) {
    const pattern = new RegExp(query, "gi");
    const termFrequencies: Record<number, Record<string, number>> = {};
    const docFrequencies: Record<string, number> = {};
    for (const chunk of this.allChunks) {
      const text = chunk.text;
      const matches = Array.from(text.matchAll(pattern));
      const termCounts: Record<string, number> = {};
      // 매치된 단어들의 빈도 계산
      for (const match of matches) {
        const term = match[0].toLowerCase();
        termCounts[term] = (termCounts[term] || 0) + 1;
      }
      if (Object.keys(termCounts).length > 0) {
        termFrequencies[chunk.chunkId] = termCounts;
        
        // 문서 빈도 업데이트
        for (const term of Object.keys(termCounts)) {
          docFrequencies[term] = (docFrequencies[term] || 0) + 1;
        }
      }
    }
    return { termFrequencies, docFrequencies };
  }
  /**
   * BM25 점수 계산
   */
  private calculateScore(
    termFrequencies: Record<number, Record<string, number>>,
    docFrequencies: Record<string, number>
  ): Array<{ id: number; chunkId: number; score: number; totalTF: number }> {
    return this.allChunks
      .filter((chunk) => termFrequencies[chunk.chunkId])
      .map((chunk) => {
        const tf = termFrequencies[chunk.chunkId];
        const len = chunk.wordCount;
        // BM25 점수 계산
        const score = Object.keys(tf)
          .map((term) => {
            const df = docFrequencies[term];
            const idf = Math.log((this.N - df + 0.5) / (df + 0.5));
            const numerator = tf[term] * (this.k1 + 1);
            const denominator =
              tf[term] +
              this.k1 * (1 - this.b + this.b * (len / this.averageDocLength));
            return idf * (numerator / denominator);
          })
          .reduce((sum, v) => sum + v, 0);
        const totalTF = Object.values(tf).reduce((sum, v) => sum + v, 0);
        return {
          id: chunk.id,
          chunkId: chunk.chunkId,
          score,
          totalTF
        };
      });
  }
}
/**
 * 정규식 특수문자 이스케이프
 */
export function escapeRegExp(string: string): string {
  return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}