Readwise MCP Enhanced

content-converter.ts•10.2 KiB

import { parse } from 'node-html-parser';
// @ts-ignore - wordsninja doesn't have TypeScript declarations
import WordsNinjaPack from 'wordsninja';

// Global WordsNinja instance; only assigned once its dictionary has finished loading
let wordsNinja: any = null;

// Initialization promise shared by concurrent callers so the dictionary is loaded once
let wordsNinjaLoading: Promise<any> | null = null;

// Only purely alphabetic tokens are candidates for segmentation
const ALPHABETIC_WORD = /^[a-zA-Z]+$/;

// Separator placed between keyword-matched sections in the filtered output
const SECTION_SEPARATOR = '\n\n--- \n\n';

/**
 * Lazily creates the shared WordsNinja instance and loads its dictionary.
 *
 * The in-flight load is cached as a promise, so callers that arrive while the
 * dictionary is still loading wait for it instead of receiving an instance
 * whose dictionary is not ready. A failed load clears the cache so a later
 * call can retry.
 *
 * @returns The initialized WordsNinja instance.
 * @throws Whatever the WordsNinja constructor or `loadDictionary()` throws.
 */
async function initWordSegmentation(): Promise<any> {
  if (wordsNinja) {
    return wordsNinja;
  }

  if (!wordsNinjaLoading) {
    wordsNinjaLoading = (async () => {
      try {
        const instance = new WordsNinjaPack();
        await instance.loadDictionary();
        // Publish the instance only after the dictionary is usable
        wordsNinja = instance;
        return instance;
      } catch (error) {
        // Do not keep a rejected promise around; allow the next call to retry
        wordsNinjaLoading = null;
        throw error;
      }
    })();
  }

  return wordsNinjaLoading;
}

/**
 * Maps lower-cased segments back onto the characters of the original word so
 * that the original capitalization survives segmentation.
 *
 * If the segments do not spell out the lower-cased original exactly, they are
 * returned unchanged.
 */
function restoreOriginalCase(original: string, segments: string[]): string[] {
  if (segments.join('') !== original.toLowerCase()) {
    return segments;
  }

  let cursor = 0;
  return segments.map(segment => {
    const piece = original.slice(cursor, cursor + segment.length);
    cursor += segment.length;
    return piece;
  });
}

/**
 * Splits merged words (e.g. "whatyou") in a text into separate words.
 *
 * The text is split on whitespace and re-joined with single spaces. Tokens that
 * are at least 6 characters long and purely alphabetic are passed to WordsNinja;
 * every other token is kept as is. Any failure is logged and the affected word
 * (or, if initialization fails, the whole text) is returned unchanged.
 *
 * @param text Text to process; texts shorter than 10 characters are returned untouched.
 * @returns The text with segmented words separated by spaces.
 */
async function segmentMergedWords(text: string): Promise<string> {
  if (!text || text.length < 10) return text;

  try {
    console.error('[Readwise-MCP] [segmentMergedWords] Starting word segmentation...');
    const ninja = await initWordSegmentation();
    console.error('[Readwise-MCP] [segmentMergedWords] WordsNinja initialized successfully');

    // Split text into words and process each potential merged word
    const words = text.split(/\s+/);
    const processedWords: string[] = [];
    let segmentationCount = 0;

    console.error(`[Readwise-MCP] [segmentMergedWords] Processing ${words.length} words...`);

    for (const word of words) {
      // The threshold is low enough to catch merged words like "whatyou" (7 chars)
      if (word.length < 6 || !ALPHABETIC_WORD.test(word)) {
        // Word is short or contains non-letters, keep as is
        processedWords.push(word);
        continue;
      }

      try {
        const segmented = ninja.splitSentence(word.toLowerCase());
        if (Array.isArray(segmented) && segmented.length > 1) {
          // Successfully segmented: re-apply the original casing and join with spaces
          const replacement = restoreOriginalCase(word, segmented).join(' ');
          processedWords.push(replacement);
          segmentationCount++;
          console.error(`[Readwise-MCP] [segmentMergedWords] Segmented "${word}" → "${replacement}"`);
        } else {
          // Couldn't segment, keep original
          processedWords.push(word);
        }
      } catch (error) {
        // Error in segmentation, keep original word
        processedWords.push(word);
        console.error(`[Readwise-MCP] [segmentMergedWords] Error segmenting "${word}":`, error);
      }
    }

    console.error(`[Readwise-MCP] [segmentMergedWords] Completed: processed ${words.length} words, segmented ${segmentationCount} words`);
    return processedWords.join(' ');
  } catch (error) {
    // If segmentation fails, return original text
    console.error('[Readwise-MCP] [segmentMergedWords] Failed:', error);
    return text;
  }
}

/**
 * Converts the content behind a URL to text using jina.ai.
 *
 * @param url URL of the page to convert; it is appended to `https://r.jina.ai/`.
 * @returns The response body as text.
 * @throws Error when the response status is not OK.
 */
export async function convertWithJina(url: string): Promise<string> {
  const jinaUrl = `https://r.jina.ai/${url}`;

  const response = await fetch(jinaUrl, {
    headers: {
      'Accept': 'text/plain',
      'User-Agent': 'Readwise-MCP-Server/1.0.0'
    }
  });

  if (!response.ok) {
    throw new Error(`Jina conversion failed: ${response.status}`);
  }

  return response.text();
}

/**
 * Extracts readable text from an HTML string.
 *
 * Removes `script`, `style`, `nav`, `header` and `footer` elements, collapses
 * all whitespace to single spaces, inserts a space at lower-case → upper-case
 * boundaries, and then runs word segmentation on the result.
 *
 * @param htmlContent Raw HTML; blank input yields an empty string.
 * @returns The cleaned text, prefixed by the page title and a blank line when a title exists.
 */
export async function extractTextFromHtml(htmlContent: string): Promise<string> {
  if (!htmlContent?.trim()) {
    return '';
  }

  console.error('[Readwise-MCP] [extractTextFromHtml] Starting HTML text extraction...');
  const root = parse(htmlContent);

  // Remove non-content elements
  root.querySelectorAll('script, style, nav, header, footer').forEach(el => el.remove());

  // Get title and body text
  const title = root.querySelector('title')?.text?.trim() || '';
  const bodyText = root.querySelector('body')?.text || root.text || '';

  // Clean up whitespace while preserving word boundaries
  let cleanText = bodyText
    .replace(/\r\n/g, '\n') // Normalize line breaks
    .replace(/\r/g, '\n') // Convert remaining carriage returns
    .replace(/\n+/g, ' ') // Convert line breaks to spaces
    .replace(/\s+/g, ' ') // Collapse multiple spaces to single spaces
    // Add a missing space only where a lower-case letter is directly followed by a
    // capital ("worldHello" → "world Hello"); acronyms such as "HTML" stay intact
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    .trim();

  console.error(`[Readwise-MCP] [extractTextFromHtml] Extracted ${cleanText.length} characters, applying word segmentation...`);

  // Apply word segmentation to fix merged words
  cleanText = await segmentMergedWords(cleanText);

  console.error(`[Readwise-MCP] [extractTextFromHtml] Final text length: ${cleanText.length} characters`);

  return title ? `${title}\n\n${cleanText}` : cleanText;
}

/**
 * Applies keyword filtering, an offset and a length limit to a piece of content.
 *
 * Keyword filtering (when `filterKeywords` is non-empty):
 * - the content is split into paragraphs on blank lines; a single paragraph
 *   longer than 2000 characters is split into sentences instead, and if any
 *   sentence is still longer than 1500 characters an 800-character sliding
 *   window with 100 characters of overlap is used;
 * - each unit containing a keyword (case-insensitive) is emitted together with
 *   up to two units of context on either side;
 * - at most 10 sections are returned, sharing the `maxLength` budget.
 *
 * @param content Text to process; blank content yields an empty result.
 * @param options.maxLength Maximum output length (default 50000; 0 is treated as unset).
 * @param options.startOffset Number of leading characters to skip. An offset at or
 *   beyond the end of the processed content is ignored.
 * @param options.filterKeywords Keywords used to select sections.
 * @returns The processed content, whether anything was cut (`truncated`), the
 *   length of the original content, the untruncated matched sections, and debug
 *   information whenever `filterKeywords` was supplied.
 */
export function processContentWithOptions(
  content: string,
  options: {
    maxLength?: number;
    startOffset?: number;
    filterKeywords?: string[];
  } = {}
): {
  content: string;
  truncated: boolean;
  totalLength: number;
  extractedSections?: string[];
  debug?: any;
} {
  if (!content?.trim()) {
    return { content: '', truncated: false, totalLength: 0 };
  }

  const totalLength = content.length;
  let processedContent = content;
  let truncated = false;
  let extractedSections: string[] | undefined;
  let keywordFilteringApplied = false;

  // Filter by keywords if provided
  if (options.filterKeywords && options.filterKeywords.length > 0) {
    const keywords = options.filterKeywords.map(keyword => keyword.toLowerCase());
    const sections: string[] = [];
    let paragraphs = content.split(/\n\s*\n/);
    // Text used to glue a match to its context; depends on how the content was split
    let contextJoiner = '\n\n';

    // If paragraph splitting results in only one section (continuous text like YouTube
    // transcripts), fall back to sentence-based splitting. A single paragraph is the
    // whole content, so its length is content.length.
    if (paragraphs.length === 1 && content.length > 2000) {
      paragraphs = content
        .split(/\.\s+/)
        .map(sentence => sentence.trim())
        .filter(sentence => sentence.length > 0)
        // Restore the period consumed by the split, without doubling existing punctuation
        .map(sentence => (/[.!?]$/.test(sentence) ? sentence : `${sentence}.`));
      contextJoiner = ' ';

      // If sentences are still too long, use sliding window approach
      if (paragraphs.some(p => p.length > 1500)) {
        paragraphs = [];
        const windowSize = 800;
        const overlap = 100;
        for (let i = 0; i < content.length; i += windowSize - overlap) {
          const chunk = content.substring(i, i + windowSize).trim();
          if (chunk) {
            paragraphs.push(chunk);
          }
        }
      }
    }

    const units = paragraphs;
    units.forEach((paragraph, i) => {
      const haystack = paragraph.toLowerCase();
      if (!keywords.some(keyword => haystack.includes(keyword))) {
        return;
      }

      // Include surrounding context: previous 2 and next 2 paragraphs for more context
      const contextStart = Math.max(0, i - 2);
      const contextEnd = Math.min(units.length, i + 3);
      const contextChunk = units.slice(contextStart, contextEnd).join(contextJoiner).trim();

      // Avoid duplicate sections by checking if we already have overlapping content
      const needle = paragraph.trim();
      const isDuplicate = sections.some(
        existing => existing.includes(needle) || needle.includes(existing)
      );

      if (!isDuplicate) {
        sections.push(contextChunk);
      }
    });

    if (sections.length > 0) {
      extractedSections = sections;
      keywordFilteringApplied = true;

      // Instead of joining all sections, distribute the maxLength across chunks
      const maxLength = options.maxLength || 50000;
      const maxChunks = 10;
      const minChunkSize = 200; // Minimum viable chunk size
      // Calculate how much space each chunk should get
      const avgChunkSize = Math.floor(maxLength / Math.min(sections.length, maxChunks));
      const chunks: string[] = [];
      let totalUsed = 0;

      for (const section of sections) {
        if (totalUsed >= maxLength || chunks.length >= maxChunks) {
          break;
        }

        const remainingSpace = maxLength - totalUsed;
        const chunkSize = Math.max(minChunkSize, Math.min(avgChunkSize, remainingSpace));

        if (section.length <= chunkSize) {
          // Section fits entirely
          chunks.push(section);
          totalUsed += section.length + SECTION_SEPARATOR.length;
        } else if (remainingSpace >= minChunkSize) {
          // Truncate section to fit
          chunks.push(section.substring(0, chunkSize - 3) + '...');
          totalUsed += chunkSize + SECTION_SEPARATOR.length;
          truncated = true;
        } else {
          // Not enough space left
          break;
        }
      }

      // Sections that did not make it into the output also count as truncation
      if (chunks.length < sections.length) {
        truncated = true;
      }

      processedContent = chunks.join(SECTION_SEPARATOR);
    } else {
      processedContent = '[No content found matching the specified keywords]';
    }
  }

  // Apply offset if specified.
  // NOTE(review): an offset at or past the end is ignored and the content is returned
  // from the start; confirm whether pagination callers would prefer an empty page.
  const startOffset = options.startOffset || 0;
  if (startOffset > 0 && startOffset < processedContent.length) {
    processedContent = processedContent.substring(startOffset);
    truncated = true;
  }

  // Apply max length if specified (but skip if keyword filtering already handled it)
  if (!keywordFilteringApplied) {
    const maxLength = options.maxLength || 50000; // Default to 50k chars
    if (processedContent.length > maxLength) {
      processedContent = processedContent.substring(0, maxLength);
      truncated = true;
    }
  }

  return {
    content: processedContent,
    truncated,
    totalLength,
    extractedSections,
    // Debug info for troubleshooting
    debug: options.filterKeywords
      ? {
          keywordCount: options.filterKeywords.length,
          originalLength: content.length,
          paragraphCount: content.split(/\n\s*\n/).length,
          keywordFilteringApplied,
          finalLength: processedContent.length
        }
      : undefined
  };
}

/**
 * Converts the content behind a URL to LLM-friendly text.
 *
 * Uses jina for articles, PDFs and uncategorized content; for every other
 * category the URL is fetched directly and its HTML is parsed locally.
 *
 * @param url URL to convert; a blank URL yields an empty string.
 * @param category Optional content category.
 * @returns The extracted text, or a placeholder message when conversion fails
 *   (errors are logged, never thrown).
 */
export async function convertUrlToText(url: string, category?: string): Promise<string> {
  if (!url?.trim()) {
    return '';
  }

  try {
    // Use jina for articles and PDFs, lightweight HTML parsing for others
    const shouldUseJina = !category || category === 'article' || category === 'pdf';

    if (shouldUseJina) {
      return await convertWithJina(url);
    }

    // For non-article/pdf content, we'll rely on HTML content from Readwise
    // This function is now mainly used as a fallback
    const response = await fetch(url, {
      headers: {
        'User-Agent': 'Readwise-MCP-Server/1.0.0',
        'Accept': 'text/html,application/xhtml+xml'
      }
    });

    if (!response.ok) {
      throw new Error(`HTML fetch failed: ${response.status}`);
    }

    const html = await response.text();
    return await extractTextFromHtml(html);
  } catch (error) {
    console.warn('Error converting URL to text:', error);
    return '[Content unavailable - conversion error]';
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/arnaldo-delisio/readwise-mcp-enhanced'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

content-converter.ts•10.2 KiB