Skip to main content
Glama

arXiv MCP Server

by Mnehmos
index.ts (19.3 kB)
#!/usr/bin/env node
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import {
  CallToolRequestSchema,
  ErrorCode,
  ListToolsRequestSchema,
  McpError,
} from '@modelcontextprotocol/sdk/types.js';
import axios from 'axios';
import fs from 'fs-extra';
import * as path from 'path';
// pdf-parse will be imported dynamically where needed

// Base URL for arXiv API
const ARXIV_API_BASE_URL = 'http://export.arxiv.org/api/query';

// Directory for temporary PDF storage
const TEMP_PDF_DIR = path.join(process.cwd(), 'temp', 'pdfs');

// Upper bound applied to max_results before the request is sent (advertised in the tool schemas)
const MAX_RESULTS_LIMIT = 2000;

// Page size used when the caller does not specify max_results
const DEFAULT_MAX_RESULTS = 10;

// Interface for search parameters
interface SearchParams {
  search_query?: string;
  id_list?: string;
  start?: number;
  max_results?: number;
  sortBy?: string;
  sortOrder?: string;
}

// Interface for paper search arguments
interface SearchPapersArgs {
  query?: string;
  category?: string;
  author?: string;
  title?: string;
  abstract?: string;
  start?: number;
  max_results?: number;
  sort_by?: string;
  sort_order?: string;
}

// Interface for get paper arguments
interface GetPaperArgs {
  paper_id: string;
}

// Interface for category search arguments
interface SearchByCategoryArgs {
  category: string;
  start?: number;
  max_results?: number;
  sort_by?: string;
  sort_order?: string;
}

// Interface for get paper content arguments
interface GetPaperContentArgs {
  paper_id: string;
}

// Pagination and sorting options shared by the search tools
interface PagingAndSortingArgs {
  start?: number;
  max_results?: number;
  sort_by?: string;
  sort_order?: string;
}

// A <link> element extracted from a feed entry
interface ArxivLink {
  href: string;
  rel: string;
  type: string;
}

// A single paper extracted from a feed <entry>
interface ArxivPaper {
  id: string;
  title: string;
  summary: string;
  authors: string[];
  published: string;
  updated: string;
  categories: string[];
  links: ArxivLink[];
  arxiv_id: string;
}

/**
 * Builds a non-global regex capturing the text content of the first `<tag>` element.
 * `[\s\S]` is used instead of `.` so content that spans several lines is captured too.
 */
function buildTagRegex(tag: string): RegExp {
  return new RegExp(`<${tag}\\b[^>]*>([\\s\\S]*?)<\\/${tag}>`);
}

/** Builds a non-global regex capturing the double-quoted value of attribute `name` inside a tag. */
function buildAttributeRegex(name: string): RegExp {
  return new RegExp(`(?:^|\\s)${name}="([^"]*)"`);
}

// These patterns carry no `g` flag, hence no lastIndex state, so sharing them is safe.
const TITLE_RE = buildTagRegex('title');
const ID_RE = buildTagRegex('id');
const SUMMARY_RE = buildTagRegex('summary');
const PUBLISHED_RE = buildTagRegex('published');
const UPDATED_RE = buildTagRegex('updated');
const TOTAL_RESULTS_RE = buildTagRegex('opensearch:totalResults');
const START_INDEX_RE = buildTagRegex('opensearch:startIndex');
const ITEMS_PER_PAGE_RE = buildTagRegex('opensearch:itemsPerPage');
const HREF_ATTR_RE = buildAttributeRegex('href');
const REL_ATTR_RE = buildAttributeRegex('rel');
const TYPE_ATTR_RE = buildAttributeRegex('type');

/** Returns the trimmed first capture group of `pattern` in `xml`, or undefined when there is no match. */
function readTagText(xml: string, pattern: RegExp): string | undefined {
  return pattern.exec(xml)?.[1]?.trim();
}

/** Reads a base-10 integer from the element matched by `pattern`; 0 when absent or not numeric. */
function readCount(xml: string, pattern: RegExp): number {
  const value = Number.parseInt(readTagText(xml, pattern) ?? '', 10);
  return Number.isNaN(value) ? 0 : value;
}

/** Collapses every run of whitespace (including newlines) into a single space. */
function collapseWhitespace(text: string): string {
  return text.replace(/\s+/g, ' ');
}

/**
 * Picks a printable description for an axios failure.
 * Only a non-empty string body is used verbatim; objects and binary bodies would
 * otherwise be rendered as "[object Object]" or raw bytes, so the message is used instead.
 */
function describeAxiosError(error: { message: string; response?: { data?: unknown } }): string {
  const body = error.response?.data;
  return typeof body === 'string' && body.length > 0 ? body : error.message;
}

/**
 * MCP server exposing arXiv search, metadata lookup and full-text extraction as tools
 * over a stdio transport.
 */
export class ArxivServer {
  private readonly server: Server;

  constructor() {
    this.server = new Server(
      {
        name: 'arxiv-mcp-server',
        version: '0.1.0',
      },
      {
        capabilities: {
          tools: {},
        },
      }
    );

    this.setupToolHandlers();

    // Error handling
    this.server.onerror = (error) => console.error('[MCP Error]', error);
    process.on('SIGINT', async () => {
      await this.server.close();
      process.exit(0);
    });
  }

  /**
   * Expose private methods for testing purposes
   * This allows tests to access these methods without type casting
   */
  // Property for testing purposes only
  public _testMethods = {
    searchPapers: this.searchPapers.bind(this),
    getPaper: this.getPaper.bind(this),
    searchByCategory: this.searchByCategory.bind(this),
    getPaperContent: this.getPaperContent.bind(this),
    queryArxiv: this.queryArxiv.bind(this),
    processArxivResponse: this.processArxivResponse.bind(this),
    downloadPdf: this.downloadPdf.bind(this),
    extractTextFromPdf: this.extractTextFromPdf.bind(this),
  };

  /** Registers the tool catalogue and the tool-call dispatcher on the MCP server. */
  private setupToolHandlers() {
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: [
        {
          name: 'search_papers',
          description: 'Search for papers on arXiv by various criteria',
          inputSchema: {
            type: 'object',
            properties: {
              query: {
                type: 'string',
                description: 'General search query across all fields',
              },
              category: {
                type: 'string',
                description: 'arXiv category (e.g., cs.AI, physics.optics)',
              },
              author: {
                type: 'string',
                description: 'Author name',
              },
              title: {
                type: 'string',
                description: 'Words in the title',
              },
              abstract: {
                type: 'string',
                description: 'Words in the abstract',
              },
              start: {
                type: 'number',
                description: 'Starting index for pagination (0-based)',
              },
              max_results: {
                type: 'number',
                description: 'Maximum number of results to return (max 2000)',
              },
              sort_by: {
                type: 'string',
                description: 'Sort by: relevance, lastUpdatedDate, submittedDate',
                enum: ['relevance', 'lastUpdatedDate', 'submittedDate'],
              },
              sort_order: {
                type: 'string',
                description: 'Sort order: ascending or descending',
                enum: ['ascending', 'descending'],
              },
            },
          },
        },
        {
          name: 'get_paper',
          description: 'Get details about a specific paper by its arXiv ID',
          inputSchema: {
            type: 'object',
            properties: {
              paper_id: {
                type: 'string',
                description: 'arXiv paper ID (e.g., 2104.13478 or cs/0001001)',
              },
            },
            required: ['paper_id'],
          },
        },
        {
          name: 'search_by_category',
          description: 'Search for papers in a specific arXiv category',
          inputSchema: {
            type: 'object',
            properties: {
              category: {
                type: 'string',
                description: 'arXiv category (e.g., cs.AI, physics.optics)',
              },
              start: {
                type: 'number',
                description: 'Starting index for pagination (0-based)',
              },
              max_results: {
                type: 'number',
                description: 'Maximum number of results to return (max 2000)',
              },
              sort_by: {
                type: 'string',
                description: 'Sort by: relevance, lastUpdatedDate, submittedDate',
                enum: ['relevance', 'lastUpdatedDate', 'submittedDate'],
              },
              sort_order: {
                type: 'string',
                description: 'Sort order: ascending or descending',
                enum: ['ascending', 'descending'],
              },
            },
            required: ['category'],
          },
        },
        {
          name: 'get_paper_content',
          description: 'Get the full text content of a paper by downloading and extracting text from its PDF',
          inputSchema: {
            type: 'object',
            properties: {
              paper_id: {
                type: 'string',
                description: 'arXiv paper ID (e.g., 2104.13478 or cs/0001001)',
              },
            },
            required: ['paper_id'],
          },
        },
      ],
    }));

    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      try {
        const toolArgs = request.params.arguments;

        switch (request.params.name) {
          case 'search_papers':
            // Every field is optional, so a call without arguments is valid: default to {}
            // instead of letting searchPapers dereference undefined.
            return await this.searchPapers((toolArgs ?? {}) as unknown as SearchPapersArgs);

          case 'get_paper': {
            const paperId = toolArgs?.paper_id;
            if (typeof paperId !== 'string') {
              throw new McpError(
                ErrorCode.InvalidParams,
                'Missing or invalid paper_id parameter'
              );
            }
            return await this.getPaper({ paper_id: paperId });
          }

          case 'search_by_category':
            if (!toolArgs || typeof toolArgs.category !== 'string') {
              throw new McpError(
                ErrorCode.InvalidParams,
                'Missing or invalid category parameter'
              );
            }
            return await this.searchByCategory(toolArgs as unknown as SearchByCategoryArgs);

          case 'get_paper_content': {
            const paperId = toolArgs?.paper_id;
            if (typeof paperId !== 'string') {
              throw new McpError(
                ErrorCode.InvalidParams,
                'Missing or invalid paper_id parameter'
              );
            }
            return await this.getPaperContent({ paper_id: paperId });
          }

          default:
            throw new McpError(
              ErrorCode.MethodNotFound,
              `Unknown tool: ${request.params.name}`
            );
        }
      } catch (error) {
        // HTTP failures become tool-level errors; anything else (including McpError) propagates.
        if (axios.isAxiosError(error)) {
          return {
            content: [
              {
                type: 'text',
                text: `arXiv API error: ${describeAxiosError(error)}`,
              },
            ],
            isError: true,
          };
        }
        throw error;
      }
    });
  }

  /** Copies pagination and sorting options from tool arguments onto the API parameters. */
  private applyPagingAndSorting(searchParams: SearchParams, args: PagingAndSortingArgs): void {
    if (args.start !== undefined) {
      searchParams.start = args.start;
    }

    searchParams.max_results =
      args.max_results !== undefined
        ? Math.min(args.max_results, MAX_RESULTS_LIMIT) // API limit
        : DEFAULT_MAX_RESULTS;

    if (args.sort_by) {
      searchParams.sortBy = args.sort_by;
    }
    if (args.sort_order) {
      searchParams.sortOrder = args.sort_order;
    }
  }

  /** Wraps a value as a pretty-printed JSON text tool result. */
  private toJsonResult(payload: unknown) {
    return {
      content: [
        {
          type: 'text',
          text: JSON.stringify(payload, null, 2),
        },
      ],
    };
  }

  /**
   * Searches arXiv using any combination of free text, category, author, title and abstract.
   * All supplied criteria are combined with AND.
   * @param args Search criteria plus optional pagination and sorting
   * @returns Tool result whose text is the parsed feed as JSON
   */
  public async searchPapers(args: SearchPapersArgs) {
    const searchParams: SearchParams = {};

    // Build search query
    const searchTerms: string[] = [];
    if (args.query) {
      searchTerms.push(`all:${args.query}`);
    }
    if (args.category) {
      searchTerms.push(`cat:${args.category}`);
    }
    if (args.author) {
      searchTerms.push(`au:${args.author}`);
    }
    if (args.title) {
      searchTerms.push(`ti:${args.title}`);
    }
    if (args.abstract) {
      searchTerms.push(`abs:${args.abstract}`);
    }

    if (searchTerms.length > 0) {
      // Join with real spaces: axios serialises a space in `params` as "+", which is the
      // form the arXiv API expects. A literal "+" would be percent-encoded to "%2B" and
      // reach the API as a plus sign instead of a separator.
      searchParams.search_query = searchTerms.join(' AND ');
    }

    this.applyPagingAndSorting(searchParams, args);

    return this.toJsonResult(await this.queryArxiv(searchParams));
  }

  /**
   * Looks up a single paper by its arXiv identifier.
   * @param args Object containing paper_id
   * @returns Tool result whose text is the parsed feed as JSON
   */
  private async getPaper(args: GetPaperArgs) {
    const searchParams: SearchParams = {
      id_list: args.paper_id,
    };

    return this.toJsonResult(await this.queryArxiv(searchParams));
  }

  /**
   * Lists papers belonging to one arXiv category.
   * @param args Category plus optional pagination and sorting
   * @returns Tool result whose text is the parsed feed as JSON
   */
  private async searchByCategory(args: SearchByCategoryArgs) {
    const searchParams: SearchParams = {
      search_query: `cat:${args.category}`,
    };

    this.applyPagingAndSorting(searchParams, args);

    return this.toJsonResult(await this.queryArxiv(searchParams));
  }

  /**
   * Sends a query to the arXiv API and parses the returned Atom feed.
   * @param params Query-string parameters for the API request
   * @returns The parsed feed (see processArxivResponse)
   * @throws Re-throws whatever axios throws, after logging it
   */
  private async queryArxiv(params: SearchParams) {
    try {
      const response = await axios.get(ARXIV_API_BASE_URL, { params });
      return this.processArxivResponse(response.data);
    } catch (error) {
      console.error('Error querying arXiv API:', error);
      throw error;
    }
  }

  /**
   * Extracts feed metadata and paper entries from an arXiv Atom response using regular
   * expressions (no XML parser is involved, and XML entities are not decoded).
   * @param xmlData Raw XML text returned by the API
   * @returns Feed summary with a `papers` array, or an `error` object with a truncated
   *          copy of the input when extraction throws
   */
  private processArxivResponse(xmlData: string) {
    try {
      const papers: ArxivPaper[] = [];

      const entryRegex = /<entry\b[^>]*>([\s\S]*?)<\/entry>/g;
      let entryMatch: RegExpExecArray | null;
      while ((entryMatch = entryRegex.exec(xmlData)) !== null) {
        const entry = entryMatch[1] ?? '';

        const id = readTagText(entry, ID_RE) ?? '';

        // Extract authors
        const authors: string[] = [];
        const authorRegex = /<author\b[^>]*>[\s\S]*?<name\b[^>]*>([\s\S]*?)<\/name>[\s\S]*?<\/author>/g;
        let authorMatch: RegExpExecArray | null;
        while ((authorMatch = authorRegex.exec(entry)) !== null) {
          authors.push((authorMatch[1] ?? '').trim());
        }

        // Extract categories
        const categories: string[] = [];
        const categoryRegex = /<category[^>]*term="([^"]+)"/g;
        let categoryMatch: RegExpExecArray | null;
        while ((categoryMatch = categoryRegex.exec(entry)) !== null) {
          categories.push(categoryMatch[1] ?? '');
        }

        // Extract links. Attributes are read one by one from each tag so that rel/type are
        // found regardless of their order; links without an href are skipped.
        const links: ArxivLink[] = [];
        const linkTagRegex = /<link\b[^>]*>/g;
        let linkTagMatch: RegExpExecArray | null;
        while ((linkTagMatch = linkTagRegex.exec(entry)) !== null) {
          const tag = linkTagMatch[0];
          const href = HREF_ATTR_RE.exec(tag)?.[1];
          if (!href) {
            continue;
          }
          links.push({
            href,
            // `||` on purpose: an empty attribute value falls back to the default as well
            rel: REL_ATTR_RE.exec(tag)?.[1] || 'alternate',
            type: TYPE_ATTR_RE.exec(tag)?.[1] || 'text/html',
          });
        }

        papers.push({
          id,
          title: collapseWhitespace(readTagText(entry, TITLE_RE) ?? ''),
          summary: collapseWhitespace(readTagText(entry, SUMMARY_RE) ?? ''),
          authors,
          published: readTagText(entry, PUBLISHED_RE) ?? '',
          updated: readTagText(entry, UPDATED_RE) ?? '',
          categories,
          links,
          // Extract arXiv ID from the main ID (either URL scheme)
          arxiv_id: id.replace(/^https?:\/\/arxiv\.org\/abs\//, ''),
        });
      }

      return {
        // The first <title> in the document is taken as the feed title
        feed_title: readTagText(xmlData, TITLE_RE) ?? '',
        total_results: readCount(xmlData, TOTAL_RESULTS_RE),
        start_index: readCount(xmlData, START_INDEX_RE),
        items_per_page: readCount(xmlData, ITEMS_PER_PAGE_RE),
        papers,
      };
    } catch (error) {
      console.error('Error parsing arXiv XML response:', error);
      return {
        error: 'Failed to parse arXiv response',
        // String() keeps this path from throwing again if the payload was not a string
        raw_response: String(xmlData).substring(0, 1000) + '...', // Truncated for safety
      };
    }
  }

  /**
   * Downloads a PDF file from a URL and saves it to the temporary directory.
   * A file already present for the same paper ID is reused without downloading.
   * @param url URL of the PDF to download
   * @param paperId arXiv paper ID (used for filename)
   * @returns Path to the downloaded PDF file
   * @throws Error with a "Failed to download PDF" message wrapping any underlying failure
   */
  private async downloadPdf(url: string, paperId: string): Promise<string> {
    try {
      // Ensure temp directory exists
      await fs.ensureDir(TEMP_PDF_DIR);

      // Keep only filename-safe characters so the ID cannot introduce path separators.
      // IDs such as "cs/0001001" still map to "cs_0001001".
      const sanitizedPaperId = paperId.replace(/[^A-Za-z0-9._-]/g, '_');
      const pdfPath = path.join(TEMP_PDF_DIR, `${sanitizedPaperId}.pdf`);

      // Check if we already have this PDF cached
      if (await fs.pathExists(pdfPath)) {
        console.error(`Using cached PDF for ${paperId}`);
        return pdfPath;
      }

      console.error(`Downloading PDF for ${paperId} from ${url}`);

      // Note: Using responseType 'arraybuffer' to handle binary data
      const response = await axios.get(url, {
        responseType: 'arraybuffer',
        headers: {
          'User-Agent': 'arXiv-MCP-Server/0.1.0 (https://github.com/your-username/arxiv-mcp-server)',
        },
        // Add a timeout to prevent hanging on large files
        timeout: 30000,
      });

      // Save the PDF to disk
      await fs.outputFile(pdfPath, response.data);

      return pdfPath;
    } catch (error) {
      console.error('Error downloading PDF:', error);
      throw new Error(`Failed to download PDF: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  /**
   * Extracts text content from a PDF file
   * @param pdfPath Path to the PDF file
   * @returns Extracted text content
   * @throws Error with a "Failed to extract text from PDF" message wrapping any underlying failure
   */
  private async extractTextFromPdf(pdfPath: string): Promise<string> {
    try {
      // Read the PDF file
      const dataBuffer = await fs.readFile(pdfPath);

      // Dynamically import pdf-parse
      const pdfParse = (await import('pdf-parse')).default;

      // Parse the PDF
      const data = await pdfParse(dataBuffer);

      // Return the text content
      return data.text;
    } catch (error) {
      console.error('Error extracting text from PDF:', error);
      throw new Error(`Failed to extract text from PDF: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  /**
   * Gets the full text content of a paper by downloading and extracting text from its PDF.
   * Failures are reported as a tool result with `isError: true` rather than thrown.
   * @param args Object containing paper_id
   * @returns Object containing the extracted text content
   */
  private async getPaperContent(args: GetPaperContentArgs) {
    try {
      // Construct the PDF URL directly
      // arXiv PDF URLs follow the pattern: https://arxiv.org/pdf/{paper_id}.pdf
      const pdfUrl = `https://arxiv.org/pdf/${args.paper_id}.pdf`;

      // Download the PDF
      const pdfPath = await this.downloadPdf(pdfUrl, args.paper_id);

      // Extract text from the PDF
      const textContent = await this.extractTextFromPdf(pdfPath);

      // Clean up the text. Line endings are normalised first and only horizontal whitespace
      // is collapsed; collapsing all whitespace up front would erase every line break.
      const cleanedText = textContent
        .replace(/\r\n?/g, '\n')
        .replace(/[^\S\n]+/g, ' ')
        .replace(/ ?\n ?/g, '\n')
        .replace(/\n{3,}/g, '\n\n')
        .trim();

      // Return the extracted text
      return {
        content: [
          {
            type: 'text',
            text: cleanedText,
          },
        ],
      };
    } catch (error) {
      console.error('Error in getPaperContent:', error);

      // Defensive: downloadPdf rewraps its failures as plain Error, so this branch only
      // triggers if an axios error reaches here by another route.
      if (axios.isAxiosError(error)) {
        return {
          content: [
            {
              type: 'text',
              text: `Error retrieving paper content: ${describeAxiosError(error)}`,
            },
          ],
          isError: true,
        };
      }

      return {
        content: [
          {
            type: 'text',
            text: `Error processing paper content: ${error instanceof Error ? error.message : String(error)}`,
          },
        ],
        isError: true,
      };
    }
  }

  /** Connects the server to a stdio transport and starts serving requests. */
  async run() {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);
    // Logged to stderr; the stdio transport is connected to the process's standard streams.
    console.error('arXiv MCP server running on stdio');
  }
}

const server = new ArxivServer();
server.run().catch(console.error);

Implementation Reference

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Mnehmos/arxiv-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.