Skip to main content
Glama
WhenYouAreStrange

goodbook-mcp

pdf-parser.js (6.14 kB)
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Patterns that mark a line as a section header. None of them carries the
 * `g`/`y` flag, so the literals can be shared safely between `.test()` calls.
 */
const SECTION_HEADER_PATTERNS = Object.freeze([
  /^[А-ЯЁ\s]{3,}$/, // All caps Cyrillic
  /^[A-Z\s]{3,}$/, // All caps Latin
  /^\d+\.\s*[А-ЯЁA-Z]/, // Numbered sections
  /^Глава\s+\d+/i, // Chapter (Russian)
  /^Chapter\s+\d+/i, // Chapter (English)
  /^Раздел\s+\d+/i, // Section (Russian)
  /^Section\s+\d+/i, // Section (English)
]);

/** Maximum number of matches returned by `searchContent`. */
const MAX_SEARCH_RESULTS = 20;

/**
 * Loads `menu.pdf` (one directory above this module), extracts its text with
 * pdf.js and exposes simple section / search / excerpt helpers over it.
 *
 * Text is stored with one line per PDF page (the text items of a page are
 * joined with spaces), so section detection and search context both work at
 * page granularity.
 */
class PDFParser {
  constructor() {
    this.pdfPath = path.join(__dirname, '..', 'menu.pdf');
    /** @type {string | null} Full extracted text, or null until loaded. */
    this.parsedContent = null;
    // Section names come from PDF text, so use a prototype-less dictionary:
    // a header such as "CONSTRUCTOR" must not resolve to an inherited member.
    /** @type {Record<string, string[]>} */
    this.sections = Object.create(null);
  }

  /**
   * Reads the PDF, extracts the text of every page and splits it into
   * sections.
   *
   * @returns {Promise<boolean>} `true` on success; `false` if the file is
   *   missing or cannot be parsed (the error is logged, not thrown).
   */
  async initialize() {
    try {
      console.log('Инициализация парсера PDF...');

      // Make sure the PDF file exists before trying to read it.
      await fs.access(this.pdfPath);
      console.log('PDF файл найден!');

      console.log('Читаем PDF файл...');
      const pdfBuffer = await fs.readFile(this.pdfPath);
      const loadingTask = getDocument({
        data: new Uint8Array(pdfBuffer),
        verbosity: 0, // Silence pdf.js logging
      });

      // Pages are read sequentially so the text keeps document order.
      const pageTexts = [];
      try {
        const pdf = await loadingTask.promise;
        for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
          const page = await pdf.getPage(pageNum);
          const textContent = await page.getTextContent();
          pageTexts.push(textContent.items.map((item) => item.str).join(' '));
        }
      } finally {
        // Release the document and its worker once the text is extracted
        // (or extraction failed); nothing else keeps a reference to them.
        await loadingTask.destroy();
      }

      this.parsedContent = pageTexts.map((text) => `${text}\n`).join('');
      console.log(`Контент из PDF загружен. Длина текста: ${this.parsedContent.length} символов`);

      this.parseIntoSections();
      return true;
    } catch (error) {
      console.error('Ошибка при инициализации парсера:', error);
      if (error.code === 'ENOENT') {
        console.error('Ошибка: PDF файл menu.pdf не найден в корневой папке проекта!');
        console.error('Пожалуйста, поместите файл menu.pdf в папку:', path.dirname(this.pdfPath));
      } else {
        console.error('Ошибка парсинга PDF:', error.message);
      }
      return false;
    }
  }

  /**
   * Rebuilds `this.sections` from `this.parsedContent`.
   *
   * Lines recognised by `isSectionHeader` start a new section (keyed by
   * `normalizeSectionName`); every other non-empty line is appended, trimmed,
   * to the current section. Lines before the first header go to `general`.
   * Does nothing when no content has been loaded.
   */
  parseIntoSections() {
    if (!this.parsedContent) return;

    const sections = Object.create(null);
    sections.general = [];
    let currentLines = sections.general;

    for (const rawLine of this.parsedContent.split('\n')) {
      const line = rawLine.trim();
      if (!line) continue;

      if (this.isSectionHeader(line)) {
        // Repeated headers with the same normalized name share one bucket.
        const name = this.normalizeSectionName(line);
        sections[name] ??= [];
        currentLines = sections[name];
      } else {
        currentLines.push(line);
      }
    }

    this.sections = sections;
    const sectionNames = Object.keys(sections);
    console.log(`Parsed content into ${sectionNames.length} sections:`, sectionNames);
  }

  /**
   * Tells whether a (trimmed) line looks like a section header.
   *
   * @param {string} line
   * @returns {boolean}
   */
  isSectionHeader(line) {
    return SECTION_HEADER_PATTERNS.some((pattern) => pattern.test(line));
  }

  /**
   * Turns a header line into a section key: lower-cased, punctuation
   * removed, whitespace runs replaced by `_`, at most 50 characters.
   *
   * @param {string} header
   * @returns {string}
   */
  normalizeSectionName(header) {
    return header
      .toLowerCase()
      // `\w` is ASCII-only and the text is already lower-cased, so the
      // Cyrillic range must be the lower-case one or every Russian letter
      // would be stripped.
      .replace(/[^\wа-яё\s]/g, '')
      .replace(/\s+/g, '_')
      .substring(0, 50);
  }

  /**
   * Case-insensitive substring search over the whole text or one section.
   *
   * @param {string} query - Text to look for.
   * @param {string | null} [sectionName=null] - Section key to restrict the
   *   search to; any falsy value searches the whole document.
   * @returns {{ error: string } | {
   *   query: string,
   *   section: string,
   *   totalResults: number,
   *   results: Array<{ line: number, content: string, before: string, after: string }>
   * }} An error object, or the matches (at most 20) with the total count.
   *   `line` is the zero-based index within the searched lines.
   */
  searchContent(query, sectionName = null) {
    if (!this.parsedContent) {
      return { error: 'PDF content not loaded' };
    }

    let lines;
    if (sectionName) {
      const section = this.sections[sectionName];
      if (!Array.isArray(section)) {
        return { error: `Section '${sectionName}' not found` };
      }
      // Search the section line by line so that `before`/`after` carry the
      // neighbouring lines; an existing but empty section yields no matches.
      lines = section;
    } else {
      lines = this.parsedContent.split('\n');
    }

    const queryLower = query.toLowerCase();
    const results = [];
    lines.forEach((line, index) => {
      if (!line.toLowerCase().includes(queryLower)) return;
      results.push({
        line: index,
        content: line.trim(),
        before: index > 0 ? lines[index - 1].trim() : '',
        after: index < lines.length - 1 ? lines[index + 1].trim() : '',
      });
    });

    return {
      query,
      section: sectionName || 'all',
      totalResults: results.length,
      results: results.slice(0, MAX_SEARCH_RESULTS),
    };
  }

  /**
   * Lists all sections with a short preview.
   *
   * @returns {Array<{ name: string, contentLength: number, preview: string }>}
   *   `contentLength` is the number of lines in the section (not characters);
   *   `preview` is the first three lines joined with spaces, cut to 200
   *   characters and always followed by `...`.
   */
  getSections() {
    return Object.keys(this.sections).map((name) => {
      const sectionLines = this.sections[name];
      const preview = sectionLines.slice(0, 3).join(' ').substring(0, 200);
      return {
        name,
        contentLength: sectionLines.length,
        preview: `${preview}...`,
      };
    });
  }

  /**
   * Returns the beginning of the document or of one section.
   *
   * @param {string | null} [sectionName=null] - Section key; any falsy value
   *   selects the whole document.
   * @param {number} [limit=1000] - Maximum number of characters returned.
   * @returns {{ error: string } | { section: string, content: string }}
   */
  getContent(sectionName = null, limit = 1000) {
    if (!this.parsedContent) {
      return { error: 'PDF content not loaded' };
    }

    if (!sectionName) {
      return {
        section: 'all',
        content: this.parsedContent.substring(0, limit),
      };
    }

    const section = this.sections[sectionName];
    if (!Array.isArray(section)) {
      return { error: `Section '${sectionName}' not found` };
    }

    return {
      section: sectionName,
      content: section.join('\n').substring(0, limit),
    };
  }
}

export default PDFParser;

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/WhenYouAreStrange/goodbook-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.