import { readFile } from "node:fs/promises";
import { basename, extname, join } from "node:path";
import * as LinkExtractor from "./processor/LinkExtractor.js";
import * as MatterParser from "./processor/MatterParser.js";
import type { DocumentIndex } from "./processor/types.js";
import type { Semaphore } from "./semaphore.js";
/**
 * In-memory index over a set of documents.
 *
 * Maintains three structures:
 * - `documentMap`: file path -> parsed {@link DocumentIndex}.
 * - `invertedIndex`: lowercase token -> set of file paths containing it.
 * - `backlinkIndex`: normalized link target name -> set of file paths that
 *   link to it.
 */
export class Indexer {
  private readonly documentMap: Map<string, DocumentIndex> = new Map();
  private readonly invertedIndex: Map<string, Set<string>> = new Map();
  private readonly backlinkIndex: Map<string, Set<string>> = new Map();

  /** Number of documents currently held in the index. */
  public get totalFiles(): number {
    return this.documentMap.size;
  }

  /**
   * Rebuilds the whole index from scratch.
   *
   * Any previous state is discarded first. Every file is processed through
   * `ioSemaphore`; a file that fails to read or parse is logged and skipped
   * (see `processFile`), so one bad file does not abort the build.
   *
   * @param filePaths - Paths of the files to index.
   * @param ioSemaphore - Semaphore acquired around each file's processing.
   */
  public async build(
    filePaths: string[],
    ioSemaphore: Semaphore,
  ): Promise<void> {
    this.clear();
    await Promise.all(
      filePaths.map((filePath) => this.processFile(filePath, ioSemaphore)),
    );
    // Backlinks need every document's outgoing links, so build them last.
    this.buildBacklinkIndex();
  }

  /**
   * Looks up documents by an exact (case-insensitive, trimmed) token match.
   *
   * @param keyword - Token or heading phrase to look up.
   * @returns Matching documents, or an empty array when the keyword is blank
   * or unknown.
   */
  public search(keyword: string): DocumentIndex[] {
    const lowerKeyword = keyword.toLowerCase().trim();
    if (!lowerKeyword) {
      return [];
    }
    const matchingFilePaths = this.invertedIndex.get(lowerKeyword);
    if (!matchingFilePaths) {
      return [];
    }
    return Array.from(matchingFilePaths, (filePath) =>
      this.documentMap.get(filePath),
    ).filter(
      // Explicit predicate: narrowing does not rely on inferred type guards.
      (documentIndex): documentIndex is DocumentIndex =>
        documentIndex !== undefined,
    );
  }

  /**
   * @param filePath - Path exactly as it was passed to `build`.
   * @returns The indexed document, or `null` when the path is not indexed.
   */
  public getDocument(filePath: string): DocumentIndex | null {
    return this.documentMap.get(filePath) ?? null;
  }

  /** Returns every indexed document. */
  public getAllDocuments(): DocumentIndex[] {
    return Array.from(this.documentMap.values());
  }

  /**
   * Returns the paths of the documents that link to the given file.
   *
   * Matching is by file name only (lowercased, without a trailing `.md` /
   * `.mdx`), so files sharing a name in different directories share
   * backlinks.
   *
   * @param filePath - Path of the link target.
   * @returns Source file paths, or an empty array when there are none.
   */
  public getBacklinks(filePath: string): string[] {
    // normalizeLink keeps only the last segment and accepts both separators,
    // so the platform-specific output of join() is handled as well.
    const targetName = this.normalizeLink(join(filePath));
    if (!targetName) {
      return [];
    }
    const backlinks = this.backlinkIndex.get(targetName);
    return backlinks ? Array.from(backlinks) : [];
  }

  /** Drops all indexed documents, tokens and backlinks. */
  public clear(): void {
    this.documentMap.clear();
    this.invertedIndex.clear();
    this.backlinkIndex.clear();
  }

  /**
   * Reads and parses one file, then records it in the document map and the
   * inverted index. Errors are logged and swallowed so the caller's
   * `Promise.all` keeps going.
   */
  private async processFile(
    filePath: string,
    ioSemaphore: Semaphore,
  ): Promise<void> {
    // Acquire outside the try block: release() must only run after a
    // successful acquire().
    await ioSemaphore.acquire();
    try {
      const fileContent = await readFile(filePath, "utf-8");
      const { frontmatter, content } = MatterParser.parse(fileContent);
      const index: DocumentIndex = {
        filePath,
        frontmatter,
        contentLength: content.length,
        imageLinks: LinkExtractor.extractImageLinks(content),
        documentLinks: LinkExtractor.extractDocumentLinks(content),
      };
      this.documentMap.set(filePath, index);
      this.buildInvertedIndexForFile(index, content);
    } catch (error) {
      console.error(`파일 인덱싱 중 오류 발생: ${filePath}`, error);
    } finally {
      ioSemaphore.release();
    }
  }

  /**
   * Collects the search tokens of one document and registers the document
   * under each of them in the inverted index.
   *
   * Token sources: path segments, file name (with and without extension
   * stripping), frontmatter title words, frontmatter tags, content words and
   * whole heading lines. All tokens are stored lowercase because `search`
   * lowercases its keyword.
   */
  private buildInvertedIndexForFile(
    index: DocumentIndex,
    content: string,
  ): void {
    const tokens = new Set<string>();
    const addToken = (token: string): void => {
      if (token) {
        tokens.add(token);
      }
    };

    // Path pieces split on separators, whitespace, hyphens and dots.
    index.filePath
      .toLowerCase()
      .split(/[/\s\-.]+/)
      .forEach(addToken);

    // Base name without its extension; lowercased so it stays reachable
    // through search().
    addToken(
      basename(index.filePath, extname(index.filePath)).toLowerCase(),
    );

    if (index.frontmatter.title) {
      index.frontmatter.title
        .toLowerCase()
        .split(/\s+/)
        .forEach(addToken);
    }

    if (index.frontmatter.tags) {
      for (const tag of index.frontmatter.tags) {
        tokens.add(tag.toLowerCase());
      }
    }

    // Content words: runs of ASCII letters, digits and Hangul syllables.
    for (const word of content.toLowerCase().match(/[a-z0-9가-힣]+/g) ?? []) {
      tokens.add(word);
    }

    // File name with only a trailing .md/.mdx removed.
    const filename = index.filePath.split(/[/]/).pop() ?? "";
    if (filename) {
      tokens.add(filename.toLowerCase().replace(/\.mdx?$/, ""));
    }

    // Whole heading lines as phrases. The gap after the '#'s must be
    // horizontal whitespace; a plain \s+ would cross a newline and index the
    // line after an empty heading as if it were the heading text.
    for (const match of content.matchAll(/^#+[^\S\r\n]+(.*)/gm)) {
      const heading = match[1];
      if (heading) {
        addToken(heading.toLowerCase().trim());
      }
    }

    for (const token of tokens) {
      let filePaths = this.invertedIndex.get(token);
      if (!filePaths) {
        filePaths = new Set();
        this.invertedIndex.set(token, filePaths);
      }
      filePaths.add(index.filePath);
    }
  }

  /**
   * Recomputes the backlink index from the outgoing links of every indexed
   * document. Keys are produced by `normalizeLink`, the same function
   * `getBacklinks` uses for its lookup.
   */
  private buildBacklinkIndex(): void {
    this.backlinkIndex.clear();
    for (const [sourcePath, sourceDoc] of this.documentMap) {
      for (const targetLink of sourceDoc.documentLinks) {
        const normalizedTarget = this.normalizeLink(targetLink);
        if (!normalizedTarget) {
          // Nothing to key on (e.g. a link ending in a separator).
          continue;
        }
        let sources = this.backlinkIndex.get(normalizedTarget);
        if (!sources) {
          sources = new Set();
          this.backlinkIndex.set(normalizedTarget, sources);
        }
        sources.add(sourcePath);
      }
    }
  }

  /**
   * Reduces a link or file path to its backlink key: the last path segment
   * (either `/` or `\` separated), lowercased, without a trailing `.md` or
   * `.mdx`.
   */
  private normalizeLink(link: string): string {
    const lastSegment = link.split(/[\\/]/).pop() ?? "";
    return lastSegment.toLowerCase().replace(/\.mdx?$/, "");
  }
}