Node.js Sandbox MCP Server

Overview Schema Related Servers Score Discussions

download-article.js

download-article.js•12.6 KiB

#!/usr/bin/env node
/**
 * Script to download article HTML and extract/download all images
 *
 * Usage: node download-article.js <url> <image-prefix>
 * Example: node download-article.js https://example.com/article myprefix
 *
 * The article HTML is written to ../resources/<last-path-segment>.html and each
 * discovered image to ../public/images/pillars/<image-prefix>_<n><ext>
 * (both relative to this script's directory).
 */
'use strict';

const https = require('https');
const http = require('http');
const fs = require('fs');
const path = require('path');
const { URL } = require('url');
const zlib = require('zlib');

// Get command line arguments
const args = process.argv.slice(2);

if (args.length < 2) {
  console.error('Usage: node download-article.js <url> <image-prefix>');
  process.exit(1);
}

const articleUrl = args[0];
const imagePrefix = args[1];

// Directories
const resourcesDir = path.join(__dirname, '..', 'resources');
const imagesDir = path.join(__dirname, '..', 'public', 'images', 'pillars');

// Ensure directories exist
for (const dir of [resourcesDir, imagesDir]) {
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir, { recursive: true });
  }
}

const USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/**
 * Build the options object for http(s).request from a parsed URL.
 *
 * @param {URL} urlObj - Parsed target URL.
 * @param {Object<string, string>} headers - Request headers to send.
 * @returns {Object} Options accepted by http.request / https.request.
 */
function buildRequestOptions(urlObj, headers) {
  return {
    hostname: urlObj.hostname,
    port: urlObj.port || (urlObj.protocol === 'https:' ? 443 : 80),
    path: urlObj.pathname + urlObj.search,
    method: 'GET',
    headers,
  };
}

/**
 * Tell whether a response is a redirect that carries a Location header.
 *
 * @param {http.IncomingMessage} res - Response to inspect.
 * @returns {boolean} True for a 3xx status with a Location header.
 */
function isRedirect(res) {
  return res.statusCode >= 300 && res.statusCode < 400 && Boolean(res.headers.location);
}

/**
 * Wrap a response in the decompressor matching its Content-Encoding header.
 * Errors on the raw response are forwarded to the returned stream, so callers
 * only need a single 'error' listener.
 *
 * @param {http.IncomingMessage} res - Response whose body should be decoded.
 * @returns {stream.Readable} The response itself, or a zlib stream fed by it.
 */
function decodeBody(res) {
  const encoding = String(res.headers['content-encoding'] || '').trim().toLowerCase();

  let decoder;
  if (encoding === 'gzip' || encoding === 'x-gzip') {
    decoder = zlib.createGunzip();
  } else if (encoding === 'deflate') {
    decoder = zlib.createInflate();
  } else if (encoding === 'br') {
    decoder = zlib.createBrotliDecompress();
  } else {
    return res;
  }

  // pipe() does not propagate errors from the source to the destination.
  res.on('error', (err) => decoder.destroy(err));
  return res.pipe(decoder);
}

/**
 * Fetch HTML content from URL with redirect support
 *
 * @param {string} url - Absolute http(s) URL to fetch.
 * @param {number} [maxRedirects=5] - Remaining redirects that may be followed.
 * @returns {Promise<string>} Resolves with the response body decoded as UTF-8.
 *   Rejects on network errors, non-200 responses, invalid redirect targets, or
 *   when the redirect budget is exhausted.
 */
function fetchHtml(url, maxRedirects = 5) {
  return new Promise((resolve, reject) => {
    if (maxRedirects === 0) {
      reject(new Error('Too many redirects'));
      return;
    }

    const urlObj = new URL(url);
    const client = urlObj.protocol === 'https:' ? https : http;
    const options = buildRequestOptions(urlObj, {
      'User-Agent': USER_AGENT,
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Upgrade-Insecure-Requests': '1',
    });

    const req = client.request(options, (res) => {
      // Handle redirects
      if (isRedirect(res)) {
        res.resume(); // discard the redirect body so the socket is released
        let redirectUrl;
        try {
          redirectUrl = new URL(res.headers.location, url).href;
        } catch (err) {
          // A malformed Location header must reject, not throw inside the callback.
          reject(err);
          return;
        }
        fetchHtml(redirectUrl, maxRedirects - 1).then(resolve, reject);
        return;
      }

      if (res.statusCode !== 200) {
        res.resume();
        reject(new Error(`Failed to fetch: ${res.statusCode} ${res.statusMessage}`));
        return;
      }

      // Collect raw buffers and decode once at the end: decoding chunk by chunk
      // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
      const body = decodeBody(res);
      const chunks = [];
      body.on('data', (chunk) => {
        chunks.push(chunk);
      });
      body.on('end', () => {
        resolve(Buffer.concat(chunks).toString('utf8'));
      });
      body.on('error', reject);
    });

    req.on('error', reject);
    req.end();
  });
}

/**
 * Decode HTML entities in URL
 *
 * Handles the entities that commonly appear inside attribute values.
 * `&amp;` is decoded last so that `&amp;lt;` becomes `&lt;` rather than `<`.
 *
 * @param {string} str - Text that may contain HTML entities.
 * @returns {string} The text with the supported entities replaced.
 */
function decodeHtmlEntities(str) {
  return str
    .replace(/&quot;/g, '"')
    .replace(/&#0*39;|&#x0*27;|&apos;/gi, "'")
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&');
}

/**
 * Clean and normalize image URL
 *
 * Entities are decoded before quotes are stripped so that values such as
 * `&quot;x.png&quot;` (seen inside inline style attributes) lose their quotes.
 *
 * @param {string} url - Raw attribute or CSS value.
 * @returns {string} The value without entities, surrounding quotes or padding.
 */
function cleanImageUrl(url) {
  return decodeHtmlEntities(url.trim())
    .trim()
    .replace(/^["']|["']$/g, '')
    .trim();
}

/**
 * Normalize image URL to absolute URL
 *
 * @param {string} imageUrl - Raw URL as found in the document.
 * @param {string} baseUrl - Absolute URL of the document; relative references
 *   are resolved against it. Throws if this is not a valid URL.
 * @returns {string|null} Absolute http(s) URL, or null for empty values,
 *   data: URLs, unparsable URLs and non-HTTP schemes.
 */
function normalizeImageUrl(imageUrl, baseUrl) {
  const base = new URL(baseUrl);
  const cleaned = cleanImageUrl(imageUrl);

  // Skip empty values and inline (base64) images
  if (!cleaned || cleaned.toLowerCase().startsWith('data:')) {
    return null;
  }

  let resolved;
  try {
    // Handles protocol-relative, root-relative, relative and absolute forms.
    resolved = new URL(cleaned, base);
  } catch (e) {
    return null; // Invalid URL
  }

  // Only http(s) can be downloaded by this script (drops javascript:, mailto:, ...).
  if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
    return null;
  }
  return resolved.href;
}

/**
 * Split a srcset attribute value into its candidate URLs, dropping the
 * width/density descriptors ("2x", "800w").
 *
 * A URL is a run of non-whitespace characters, so commas inside a URL are
 * kept; only trailing commas and commas after a descriptor separate candidates.
 *
 * @param {string} srcset - Raw srcset attribute value.
 * @returns {string[]} Candidate URLs in document order.
 */
function parseSrcset(srcset) {
  const urls = [];
  let rest = srcset;

  for (;;) {
    rest = rest.replace(/^[\s,]+/, '');
    if (!rest) {
      break;
    }

    let url = /^\S+/.exec(rest)[0];
    rest = rest.slice(url.length);

    if (url.endsWith(',')) {
      // "a.png, b.png": the comma terminates a candidate without descriptors.
      url = url.replace(/,+$/, '');
    } else {
      // Skip the descriptors up to the comma that ends this candidate.
      const comma = rest.indexOf(',');
      rest = comma === -1 ? '' : rest.slice(comma + 1);
    }

    if (url) {
      urls.push(url);
    }
  }

  return urls;
}

/**
 * Extract image URLs from HTML using regex
 *
 * Sources scanned: every attribute of <img>/<source> tags whose name ends in
 * "src" (src, data-src, ...) or "srcset", CSS `background-image: url(...)`
 * declarations, and quoted `data-image` / `data-url` attributes on any tag.
 *
 * @param {string} html - Document markup.
 * @param {string} baseUrl - Absolute URL of the document.
 * @returns {string[]} Unique absolute URLs, excluding those that clearly point
 *   at non-image resources (html, css, js, json, xml, pdf).
 */
function extractImageUrls(html, baseUrl) {
  const rawUrls = [];

  // Regex literals are created per call: /g patterns keep state in lastIndex.
  const tagPattern = /<(?:img|source)\b([^>]*)>/gi;
  let tagMatch;
  while ((tagMatch = tagPattern.exec(html)) !== null) {
    // Walk every attribute so that no src-like attribute in a tag is missed.
    const attrPattern = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;
    let attrMatch;
    while ((attrMatch = attrPattern.exec(tagMatch[1])) !== null) {
      const name = attrMatch[1].toLowerCase();
      const value = attrMatch[2] || attrMatch[3] || attrMatch[4] || '';

      if (name.endsWith('srcset')) {
        rawUrls.push(...parseSrcset(value));
      } else if (name.endsWith('src')) {
        rawUrls.push(value);
      }
    }
  }

  // background-image in inline styles and CSS
  const backgroundPattern = /background-image\s*:\s*url\(\s*["']?([^"')]+)["']?\s*\)/gi;
  let backgroundMatch;
  while ((backgroundMatch = backgroundPattern.exec(html)) !== null) {
    rawUrls.push(backgroundMatch[1]);
  }

  // data-image or data-url
  const dataAttrPattern = /data-(?:image|url)\s*=\s*(?:"([^"]+)"|'([^']+)')/gi;
  let dataMatch;
  while ((dataMatch = dataAttrPattern.exec(html)) !== null) {
    rawUrls.push(dataMatch[1] || dataMatch[2]);
  }

  const imageUrls = [];
  for (const rawUrl of rawUrls) {
    const normalized = normalizeImageUrl(rawUrl, baseUrl);
    if (normalized) {
      imageUrls.push(normalized);
    }
  }

  // Remove duplicates and filter out non-image URLs
  const imageExtension = /\.(jpg|jpeg|png|gif|webp|svg|bmp|tiff|ico)(\?|$|#)/i;
  const nonImageExtension = /\.(html|css|js|json|xml|pdf)(\?|$|#)/i;

  return [...new Set(imageUrls)].filter((url) => {
    const lowerUrl = url.toLowerCase();
    return (
      imageExtension.test(lowerUrl) ||
      lowerUrl.includes('image') ||
      !nonImageExtension.test(lowerUrl)
    );
  });
}

/**
 * Get file extension from Content-Type header
 *
 * @param {string|undefined} contentType - Header value, e.g. "image/png; charset=binary".
 * @returns {string} Extension including the dot; '.jpg' when the type is
 *   missing or not a known image type.
 */
function getExtensionFromContentType(contentType) {
  const mimeTypes = {
    'image/jpeg': '.jpg',
    'image/jpg': '.jpg',
    'image/png': '.png',
    'image/gif': '.gif',
    'image/webp': '.webp',
    'image/svg+xml': '.svg',
    'image/bmp': '.bmp',
    'image/tiff': '.tiff',
  };

  if (contentType) {
    const mimeType = contentType.split(';')[0].trim().toLowerCase();
    return mimeTypes[mimeType] || '.jpg';
  }

  return '.jpg';
}

/**
 * Download an image from URL with redirect support and proper headers
 *
 * The destination file is only created once a 200 response has arrived, so
 * redirects and failed requests leave nothing behind; a partially written
 * file is removed if the transfer fails midway.
 *
 * @param {string} url - Absolute http(s) URL of the image.
 * @param {string} filepath - Destination path on disk.
 * @param {number} [maxRedirects=5] - Remaining redirects that may be followed.
 * @returns {Promise<void>} Resolves when the file has been fully written.
 *   Rejects on network, decoding or file-system errors, non-200 responses,
 *   invalid redirect targets, or when the redirect budget is exhausted.
 */
function downloadImage(url, filepath, maxRedirects = 5) {
  return new Promise((resolve, reject) => {
    if (maxRedirects === 0) {
      reject(new Error('Too many redirects'));
      return;
    }

    const urlObj = new URL(url);
    const client = urlObj.protocol === 'https:' ? https : http;
    const options = buildRequestOptions(urlObj, {
      'User-Agent': USER_AGENT,
      'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      'Connection': 'keep-alive',
      'Referer': urlObj.origin + '/',
    });

    const req = client.request(options, (res) => {
      // Handle redirects
      if (isRedirect(res)) {
        res.resume();
        let redirectUrl;
        try {
          redirectUrl = new URL(res.headers.location, url).href;
        } catch (err) {
          reject(err);
          return;
        }
        downloadImage(redirectUrl, filepath, maxRedirects - 1).then(resolve, reject);
        return;
      }

      if (res.statusCode !== 200) {
        res.resume();
        reject(new Error(`Failed to download image: ${res.statusCode} ${res.statusMessage}`));
        return;
      }

      const body = decodeBody(res);
      const file = fs.createWriteStream(filepath);
      let failed = false;

      const fail = (err) => {
        if (failed) {
          return;
        }
        failed = true;
        body.unpipe(file);
        file.destroy();
        // Best-effort cleanup: the original error is what the caller needs,
        // so a failure to unlink (e.g. file never created) is ignored.
        fs.unlink(filepath, () => reject(err));
      };

      body.on('error', fail);
      file.on('error', fail);
      file.on('finish', () => {
        if (!failed) {
          resolve();
        }
      });

      body.pipe(file);
    });

    req.on('error', reject);
    req.end();
  });
}

/**
 * Get file extension from URL
 *
 * @param {string} url - Absolute image URL.
 * @returns {string} The path's extension when it is a known image extension,
 *   otherwise an extension inferred from the path, otherwise '.jpg'.
 */
function getExtensionFromUrl(url) {
  try {
    const pathname = new URL(url).pathname;
    const ext = path.extname(pathname).toLowerCase();

    // Common image extensions
    const validExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp', '.tiff', '.ico'];
    if (ext && validExtensions.includes(ext)) {
      return ext;
    }

    // Try to infer from URL path (e.g. "/photo.png/resize")
    const lowerPath = pathname.toLowerCase();
    if (lowerPath.includes('.png')) return '.png';
    if (lowerPath.includes('.jpg') || lowerPath.includes('.jpeg')) return '.jpg';
    if (lowerPath.includes('.gif')) return '.gif';
    if (lowerPath.includes('.webp')) return '.webp';
    if (lowerPath.includes('.svg')) return '.svg';

    // Default to jpg
    return '.jpg';
  } catch (e) {
    // Unparsable URL: fall back to the default extension.
    return '.jpg';
  }
}

/**
 * Main function
 *
 * Fetches the article, saves its HTML, then downloads every discovered image
 * sequentially. A failed image download is logged and skipped; any other
 * failure terminates the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    console.log(`Fetching article from: ${articleUrl}`);

    // Fetch HTML
    const html = await fetchHtml(articleUrl);
    console.log(`✓ Fetched HTML (${Buffer.byteLength(html, 'utf8')} bytes)`);

    // Generate filename from the last path segment of the URL
    const urlObj = new URL(articleUrl);
    const lastSegment = urlObj.pathname.split('/').filter(Boolean).pop() || 'index';

    // Clean filename (replace anything that is not filesystem-safe)
    let filename = lastSegment.replace(/[^a-zA-Z0-9._-]/g, '_');
    if (!filename.endsWith('.html')) {
      filename += '.html';
    }

    const htmlPath = path.join(resourcesDir, filename);

    // Save HTML
    fs.writeFileSync(htmlPath, html, 'utf8');
    console.log(`✓ Saved HTML to: ${htmlPath}`);

    // Extract image URLs
    const imageUrls = extractImageUrls(html, articleUrl);
    console.log(`✓ Found ${imageUrls.length} images`);

    // Download images one at a time
    for (let i = 0; i < imageUrls.length; i++) {
      const imageUrl = imageUrls[i];
      const ext = getExtensionFromUrl(imageUrl);
      const newFilename = `${imagePrefix}_${i + 1}${ext}`;
      const imagePath = path.join(imagesDir, newFilename);

      try {
        console.log(`Downloading image ${i + 1}/${imageUrls.length}: ${imageUrl}`);
        await downloadImage(imageUrl, imagePath);
        console.log(`✓ Saved: ${newFilename}`);
      } catch (err) {
        // One broken image must not abort the remaining downloads.
        console.error(`✗ Failed to download ${imageUrl}: ${err.message}`);
      }
    }

    console.log('\n✓ Done!');
  } catch (error) {
    console.error('Error:', error.message);
    process.exit(1);
  }
}

main();

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/alfonsograziano/node-code-sandbox-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

download-article.js•12.6 KiB