twitterapi-io-mcp

scrape-all.cjs•21.8 KiB

#!/usr/bin/env node
/**
 * TwitterAPI.io Complete Documentation Scraper v2.2
 *
 * Fixes in this version:
 * - Path extraction bug fix (extract the correct path from the cURL example)
 * - Content truncation removed
 * - Better parameter parsing
 * - Automatic link discovery via sitemap + internal link crawl
 *
 * Running this file scrapes twitterapi.io and docs.twitterapi.io and writes
 * the result to `data/docs.json` next to this script.
 */

const http = require('http');
const https = require('https');
const fs = require('fs');
const path = require('path');

const SITE_SITEMAP_URL = 'https://twitterapi.io/sitemap.xml';
const BLOG_INDEX_URL = 'https://twitterapi.io/blog/';
const DOCS_SITEMAP_URL = 'https://docs.twitterapi.io/sitemap.xml';
const DOCS_ENDPOINT_PREFIX = 'https://docs.twitterapi.io/api-reference/endpoint/';

const ALLOWED_HOSTS = new Set(['twitterapi.io', 'docs.twitterapi.io']);

// File extensions that identify static assets rather than HTML pages.
const ASSET_EXTENSIONS = new Set([
  'css', 'js', 'mjs', 'cjs', 'map',
  'png', 'jpg', 'jpeg', 'gif', 'svg', 'webp', 'ico',
  'woff', 'woff2', 'ttf', 'eot',
  'pdf', 'zip', 'gz', 'tgz',
  'xml', 'json'
]);

const MAX_INTERNAL_CRAWL_PAGES = 250;

// Upper bound on chained HTTP redirects followed by fetchPage.
const MAX_REDIRECTS = 5;

// Socket inactivity timeout for a single request, in milliseconds.
const REQUEST_TIMEOUT_MS = 30000;

// Page keys on twitterapi.io that are categorized as "guide" instead of "info".
const GUIDE_PAGE_KEYS = new Set([
  'pricing',
  'qps_limits',
  'tweet_filter_rules',
  'changelog',
  'readme',
  'twitter_stream',
  'introduction',
  'authentication',
]);

// Blog slug -> fixed output key. These slugs are also always added as scrape targets.
const BLOG_KEY_OVERRIDES = new Map([
  ['twitter-api-pricing-2025', 'blog_pricing_2025'],
  ['twitter-analytics-api-guide', 'blog_analytics_guide'],
  ['apify-alternative-for-twitter', 'blog_apify_alternative'],
  ['build-twitter-apps-with-kiro-ai-ide', 'blog_kiro_ai'],
  ['resources-and-tools', 'blog_resources'],
  ['how-to-monitor-twitter-accounts-for-new-tweets-in-real-time', 'blog_monitor_tweets'],
  ['twitter-login-and-post-api-guide', 'blog_login_post_api_guide'],
]);

// Named character references understood by decodeHtmlEntities.
// A Map (not a plain object) so names such as "constructor" never resolve.
const NAMED_HTML_ENTITIES = new Map([
  ['lt', '<'],
  ['gt', '>'],
  ['amp', '&'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

/**
 * Fetch a URL over HTTP(S) and resolve with the response body as text.
 *
 * Follows 3xx redirects that carry a Location header, up to `redirectsLeft` hops.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {number} [redirectsLeft=MAX_REDIRECTS] - Remaining redirect hops allowed.
 * @returns {Promise<string>} The response body decoded as UTF-8.
 * @throws {Error} (as rejection) on HTTP status >= 400, too many redirects,
 *   an unparsable redirect target, a timeout, or a network error.
 */
function fetchPage(url, redirectsLeft = MAX_REDIRECTS) {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https') ? https : http;

    const request = client.get(url, (res) => {
      const { statusCode, headers } = res;

      if (statusCode >= 300 && statusCode < 400 && headers.location) {
        // Drain the redirect response so its socket is released.
        res.resume();
        if (redirectsLeft <= 0) {
          return reject(new Error(`Too many redirects for ${url}`));
        }
        let redirectedUrl;
        try {
          redirectedUrl = new URL(headers.location, url).toString();
        } catch (err) {
          // An invalid Location header must reject, not throw inside the callback.
          return reject(err);
        }
        return fetchPage(redirectedUrl, redirectsLeft - 1).then(resolve, reject);
      }

      if (statusCode >= 400) {
        res.resume();
        return reject(new Error(`HTTP ${statusCode} for ${url}`));
      }

      // Decode as UTF-8 in the stream so multi-byte characters split across
      // chunk boundaries are not corrupted by string concatenation.
      res.setEncoding('utf8');
      let data = '';
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('end', () => resolve(data));
      res.on('error', reject);
    });

    request.on('error', reject);
    // destroy(err) emits 'error' on the request, which rejects the promise.
    request.setTimeout(REQUEST_TIMEOUT_MS, () => {
      request.destroy(new Error(`Timeout after ${REQUEST_TIMEOUT_MS}ms for ${url}`));
    });
  });
}

/**
 * Decode HTML character references in a string.
 *
 * Handles the named references in NAMED_HTML_ENTITIES plus decimal (`&#39;`)
 * and hexadecimal (`&#x3C;`) numeric references. Decoding is done in a single
 * pass, so already-escaped text such as `&amp;lt;` becomes `&lt;`, not `<`.
 * Unknown or invalid references are left untouched.
 *
 * @param {string} text - Text possibly containing HTML entities.
 * @returns {string} The decoded text.
 */
function decodeHtmlEntities(text) {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity, body) => {
    if (body[0] !== '#') {
      const named = NAMED_HTML_ENTITIES.get(body.toLowerCase());
      return named === undefined ? entity : named;
    }
    const isHex = body[1] === 'x' || body[1] === 'X';
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws a RangeError above U+10FFFF.
    if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return entity;
    return String.fromCodePoint(codePoint);
  });
}

/**
 * Extract the text of every `<loc>` element from a sitemap XML string.
 *
 * @param {string} xml - Sitemap XML.
 * @returns {string[]} Trimmed, non-empty `<loc>` values in document order.
 */
function extractSitemapLocs(xml) {
  return [...xml.matchAll(/<loc>([^<]+)<\/loc>/g)]
    .map((m) => m[1].trim())
    .filter(Boolean);
}

/**
 * Normalize arbitrary text into a lowercase snake_case key.
 *
 * @param {string} input - Text to normalize.
 * @returns {string} Key consisting of `a-z`, `0-9` and single underscores.
 */
function normalizeKey(input) {
  return input
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '_')
    .replace(/^_+|_+$/g, '')
    .replace(/_+/g, '_');
}

/**
 * Remove leading and trailing slashes from a string.
 *
 * @param {string} value - Typically a URL pathname.
 * @returns {string} The value without surrounding slashes.
 */
function stripSlashes(value) {
  return value.replace(/^\/+|\/+$/g, '');
}

/**
 * Derive the output key for a (non-blog-post) page URL.
 *
 * @param {string} url - Absolute page URL.
 * @returns {string} Key used in `docs.pages`.
 */
function pageKeyFromUrl(url) {
  const parsed = new URL(url);
  const clean = stripSlashes(parsed.pathname);

  // Root pages
  if (!clean) {
    return parsed.hostname === 'docs.twitterapi.io' ? 'docs_home' : 'home';
  }

  // Site blog index
  if (parsed.hostname === 'twitterapi.io' && clean === 'blog') {
    return 'blog_index';
  }

  const baseKey = normalizeKey(clean.replace(/\//g, '_'));

  // Avoid key collisions between site and docs pages
  if (parsed.hostname === 'docs.twitterapi.io') {
    if (baseKey === 'authentication' || baseKey === 'introduction') return baseKey;
    return `docs_${baseKey}`;
  }

  return baseKey;
}

/**
 * Derive the output key for a blog URL.
 *
 * @param {string} url - Absolute blog URL.
 * @returns {string} `blog_index`, an override from BLOG_KEY_OVERRIDES, or
 *   `blog_<normalized slug>`.
 */
function blogKeyFromUrl(url) {
  const { pathname } = new URL(url);
  const clean = stripSlashes(pathname);
  if (!clean || clean === 'blog') return 'blog_index';

  const slug = clean.startsWith('blog/') ? clean.slice('blog/'.length) : clean;
  const override = BLOG_KEY_OVERRIDES.get(slug);
  if (override) return override;

  return `blog_${normalizeKey(slug)}`;
}

/**
 * Pick the category label stored alongside a scraped page.
 *
 * @param {string} key - Page key as produced by pageKeyFromUrl.
 * @param {string} host - Hostname the page was found on.
 * @returns {'docs'|'blog'|'guide'|'info'} Category label.
 */
function categoryForPageKey(key, host) {
  if (host === 'docs.twitterapi.io') return 'docs';
  if (key === 'blog_index') return 'blog';
  if (GUIDE_PAGE_KEYS.has(key)) return 'guide';
  return 'info';
}

/**
 * Turn a raw href/URL into the canonical form used for de-duplication.
 *
 * Upgrades http to https, drops the hash and query string, removes a trailing
 * slash (except for the root path), and rejects anything outside ALLOWED_HOSTS.
 *
 * @param {string} rawUrl - URL or href value; may be relative when `baseUrl` is given.
 * @param {?string} [baseUrl=null] - Base used to resolve relative values.
 * @returns {?string} Canonical absolute URL, or null when the value should be ignored.
 */
function canonicalizeUrlForScrape(rawUrl, baseUrl = null) {
  const value = (rawUrl || '').trim();
  if (!value) return null;
  if (value.startsWith('#')) return null;
  if (/^(mailto:|tel:|javascript:|data:)/i.test(value)) return null;

  let parsed;
  try {
    parsed = baseUrl ? new URL(value, baseUrl) : new URL(value);
  } catch (_err) {
    return null;
  }

  if (parsed.protocol === 'http:') parsed.protocol = 'https:';
  if (parsed.protocol !== 'https:') return null;
  if (!ALLOWED_HOSTS.has(parsed.hostname)) return null;

  parsed.hash = '';
  parsed.search = '';

  if (parsed.pathname !== '/' && parsed.pathname.endsWith('/')) {
    parsed.pathname = parsed.pathname.slice(0, -1);
  }

  return parsed.toString();
}

/**
 * Heuristically decide whether a URL points at an HTML page worth scraping.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} False for Next.js assets, the favicon, twitterapi.io `/api`
 *   routes, known asset extensions and unparsable URLs; true otherwise.
 */
function isLikelyHtmlUrl(url) {
  try {
    const parsed = new URL(url);
    const pathname = parsed.pathname.toLowerCase();

    if (pathname.includes('/_next/')) return false;
    if (pathname === '/favicon.ico') return false;
    if (parsed.hostname === 'twitterapi.io' && (pathname === '/api' || pathname.startsWith('/api/'))) return false;

    const extMatch = pathname.match(/\.([a-z0-9]+)$/);
    if (extMatch && ASSET_EXTENSIONS.has(extMatch[1])) return false;

    return true;
  } catch (_err) {
    return false;
  }
}

/**
 * Remove tags from an HTML fragment and collapse all whitespace to single spaces.
 *
 * @param {string} fragment - HTML fragment.
 * @returns {string} Plain text with entities decoded.
 */
function stripTags(fragment) {
  return decodeHtmlEntities(
    fragment
      .replace(/<[^>]+>/g, ' ')
      .replace(/\s+/g, ' ')
      .trim()
  );
}

/**
 * Remove tags from an HTML fragment while keeping line breaks and indentation
 * (used for `<pre>` code blocks).
 *
 * @param {string} fragment - HTML fragment.
 * @returns {string} Text with entities decoded and carriage returns removed.
 */
function stripTagsPreserveWhitespace(fragment) {
  return decodeHtmlEntities(
    fragment
      .replace(/<[^>]+>/g, '')
      .replace(/\r/g, '')
      .trim()
  );
}

/**
 * Collect canonical, in-scope page URLs from every `href` attribute in a document.
 *
 * @param {string} html - Page HTML.
 * @param {string} baseUrl - URL of the page, used to resolve relative links.
 * @returns {string[]} Unique canonical URLs that look like HTML pages.
 */
function discoverInternalLinksFromHtml(html, baseUrl) {
  const urls = new Set();
  const matches = html.matchAll(/href\s*=\s*["']([^"']+)["']/gi);
  for (const m of matches) {
    const canonical = canonicalizeUrlForScrape(m[1], baseUrl);
    if (!canonical) continue;
    if (!isLikelyHtmlUrl(canonical)) continue;
    urls.add(canonical);
  }
  return [...urls];
}

/**
 * Collect blog post URLs linked from the blog index page.
 *
 * @param {string} html - HTML of the blog index.
 * @returns {string[]} Unique canonical blog post URLs (the index itself is excluded).
 */
function discoverBlogUrlsFromIndex(html) {
  const urls = new Set();
  const matches = html.matchAll(/href=["'](\/blog\/[^"'?#]+)\/?["']/gi);
  for (const m of matches) {
    // Named blogPath so it does not shadow the `path` module.
    const blogPath = m[1].replace(/\/+$/g, '');
    if (blogPath === '/blog') continue;
    const canonical = canonicalizeUrlForScrape(`https://twitterapi.io${blogPath}`);
    if (canonical) urls.add(canonical);
  }
  return [...urls];
}

/**
 * Discover everything that should be scraped.
 *
 * Sources: both sitemaps, the blog index, the BLOG_KEY_OVERRIDES slugs, a small
 * list of known pages, and a bounded breadth-first crawl of internal links.
 *
 * @returns {Promise<{
 *   sitePages: Array<{url: string, name: string, category: string}>,
 *   siteBlogs: Array<{url: string, name: string, category: string}>,
 *   docsPages: Array<{url: string, name: string, category: string}>,
 *   endpoints: string[]
 * }>} Categorized targets, each list sorted for stable output.
 * @throws {Error} (as rejection) when either sitemap cannot be fetched.
 */
async function discoverScrapeTargets() {
  // twitterapi.io (marketing + blog)
  const siteXml = await fetchPage(SITE_SITEMAP_URL);
  const siteUrls = new Set(
    extractSitemapLocs(siteXml)
      .map((u) => canonicalizeUrlForScrape(u))
      .filter(Boolean)
  );

  // Blog index often contains more posts than sitemap
  try {
    const blogIndexHtml = await fetchPage(BLOG_INDEX_URL);
    for (const u of discoverBlogUrlsFromIndex(blogIndexHtml)) {
      siteUrls.add(u);
    }
  } catch (_err) {
    // Non-fatal: fall back to sitemap + overrides
  }

  // Ensure legacy/known posts remain included even if not discoverable
  for (const slug of BLOG_KEY_OVERRIDES.keys()) {
    const canonical = canonicalizeUrlForScrape(`https://twitterapi.io/blog/${slug}`);
    if (canonical) siteUrls.add(canonical);
  }

  // Ensure key site pages are present even if they aren't linked in the sitemap.
  // These pages are referenced by endpoint docs and are useful for MCP users.
  for (const url of ['https://twitterapi.io/twitter-stream']) {
    const canonical = canonicalizeUrlForScrape(url);
    if (canonical) siteUrls.add(canonical);
  }

  // docs.twitterapi.io (API reference + docs pages)
  const docsXml = await fetchPage(DOCS_SITEMAP_URL);
  const docsUrls = new Set(
    extractSitemapLocs(docsXml)
      .map((u) => canonicalizeUrlForScrape(u))
      .filter(Boolean)
  );

  // Crawl internal links to expand coverage beyond sitemap
  const discoveryQueue = [];
  const visited = new Set();

  // Seed discovery with non-endpoint pages only (avoid double-fetching all endpoints)
  for (const u of siteUrls) {
    if (isLikelyHtmlUrl(u)) discoveryQueue.push(u);
  }
  for (const u of docsUrls) {
    if (u.startsWith(DOCS_ENDPOINT_PREFIX)) continue;
    if (isLikelyHtmlUrl(u)) discoveryQueue.push(u);
  }

  while (discoveryQueue.length > 0 && visited.size < MAX_INTERNAL_CRAWL_PAGES) {
    const current = discoveryQueue.shift();
    if (!current || visited.has(current)) continue;
    visited.add(current);

    let html;
    try {
      html = await fetchPage(current);
    } catch (_err) {
      // Best-effort crawl: a page that fails to load simply contributes no links.
      continue;
    }

    for (const link of discoverInternalLinksFromHtml(html, current)) {
      if (visited.has(link)) continue;

      const parsed = new URL(link);
      if (parsed.hostname === 'docs.twitterapi.io' && parsed.pathname === '/') continue;

      // Every discovered link becomes a scrape target, even if it is not crawled further.
      if (parsed.hostname === 'twitterapi.io') {
        siteUrls.add(link);
      } else if (parsed.hostname === 'docs.twitterapi.io') {
        docsUrls.add(link);
      }

      // Avoid crawling endpoint pages (already covered by sitemap)
      if (link.startsWith(DOCS_ENDPOINT_PREFIX)) continue;

      // Avoid crawling blog posts deeply; the blog index is the source of truth
      if (parsed.hostname === 'twitterapi.io' && parsed.pathname.startsWith('/blog/') && parsed.pathname !== '/blog') {
        continue;
      }

      if (discoveryQueue.length + visited.size >= MAX_INTERNAL_CRAWL_PAGES) continue;
      discoveryQueue.push(link);
    }
  }

  // Build final categorized lists after discovery
  const sitePages = [];
  const siteBlogs = [];
  for (const u of siteUrls) {
    const parsed = new URL(u);
    if (parsed.hostname !== 'twitterapi.io') continue;

    const isBlogPost = parsed.pathname.startsWith('/blog/') && parsed.pathname !== '/blog';
    if (isBlogPost) {
      const name = blogKeyFromUrl(u);
      siteBlogs.push({ url: u, name, category: 'blog' });
    } else {
      const name = pageKeyFromUrl(u);
      sitePages.push({ url: u, name, category: categoryForPageKey(name, parsed.hostname) });
    }
  }

  const endpoints = new Set();
  const docsPages = [];
  for (const u of docsUrls) {
    if (!u.startsWith('https://docs.twitterapi.io/')) continue;

    const parsed = new URL(u);
    if (parsed.pathname === '/') continue;

    if (u.startsWith(DOCS_ENDPOINT_PREFIX)) {
      const slug = u.slice(DOCS_ENDPOINT_PREFIX.length).replace(/\/+$/g, '');
      if (slug) endpoints.add(slug);
    } else {
      const name = pageKeyFromUrl(u);
      docsPages.push({ url: u, name, category: 'docs' });
    }
  }

  // Stable ordering
  sitePages.sort((a, b) => a.name.localeCompare(b.name));
  siteBlogs.sort((a, b) => a.name.localeCompare(b.name));
  docsPages.sort((a, b) => a.name.localeCompare(b.name));
  const endpointsSorted = [...endpoints].sort();

  return { sitePages, siteBlogs, docsPages, endpoints: endpointsSorted };
}

/**
 * Extract structured content from a marketing, blog or docs page.
 *
 * @param {string} html - Page HTML.
 * @param {string} pageName - Key stored in the result's `name` field.
 * @returns {object} Record with `name`, `scraped_at`, `type: 'page'`, `raw_text`
 *   and, when present in the HTML, `title`, `description`, `headers`,
 *   `paragraphs`, `pre_blocks`, `code_snippets` and `list_items`.
 */
function extractMainSiteContent(html, pageName) {
  const result = {
    name: pageName,
    scraped_at: new Date().toISOString(),
    type: 'page'
  };

  // Title
  const titleMatch = html.match(/<title>([^<]+)<\/title>/i);
  if (titleMatch) result.title = decodeHtmlEntities(titleMatch[1].trim());

  // Meta description
  const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]+)"/i);
  if (descMatch) result.description = decodeHtmlEntities(descMatch[1]);

  // H1, H2, H3 headers
  const headers = [];
  for (const m of html.matchAll(/<h([1-3])[^>]*>([\s\S]*?)<\/h\1>/gi)) {
    const level = Number(m[1]);
    const text = stripTags(m[2]);
    if (text) headers.push({ level, text });
  }
  if (headers.length) result.headers = headers;

  // Paragraphs - keep ALL of them (no limit)
  const paragraphs = [];
  for (const m of html.matchAll(/<p[^>]*>([\s\S]*?)<\/p>/gi)) {
    const text = stripTags(m[1]);
    if (text.length > 10) paragraphs.push(text);
  }
  if (paragraphs.length) result.paragraphs = paragraphs;

  // Pre blocks (large code blocks); stored under both keys
  const preBlocks = [];
  for (const m of html.matchAll(/<pre[^>]*>([\s\S]*?)<\/pre>/gi)) {
    const text = stripTagsPreserveWhitespace(m[1]);
    if (text.length > 10) preBlocks.push(text);
  }
  if (preBlocks.length) result.pre_blocks = preBlocks;
  if (preBlocks.length) result.code_snippets = preBlocks;

  // Lists (li items) - keep ALL of them (no limit)
  const listItems = [];
  for (const m of html.matchAll(/<li[^>]*>([\s\S]*?)<\/li>/gi)) {
    const text = stripTags(m[1]);
    if (text.length > 3) listItems.push(text);
  }
  if (listItems.length) result.list_items = listItems;

  // Raw text extraction - FULL text, no truncation
  result.raw_text = decodeHtmlEntities(html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
    .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
    .replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim());

  return result;
}

/**
 * Extract structured content from an API endpoint reference page.
 *
 * @param {string} html - Page HTML.
 * @param {string} endpointName - Endpoint slug (path segment after DOCS_ENDPOINT_PREFIX).
 * @returns {object} Record with `name`, `url`, `scraped_at`, `type: 'endpoint'`,
 *   `raw_text` and, when they can be found, `title`, `description`, `path`,
 *   `method`, `parameters`, `response_fields`, `code_snippets` and `curl_example`.
 */
function extractEndpointContent(html, endpointName) {
  const result = {
    name: endpointName,
    url: `https://docs.twitterapi.io/api-reference/endpoint/${endpointName}`,
    scraped_at: new Date().toISOString(),
    type: 'endpoint'
  };

  // Title
  const titleMatch = html.match(/<title>([^<]+)<\/title>/i);
  if (titleMatch) result.title = decodeHtmlEntities(titleMatch[1].replace(' – Docs', '').trim());

  // Meta description
  const descMatch = html.match(/<meta[^>]*name="description"[^>]*content="([^"]+)"/i);
  if (descMatch) result.description = decodeHtmlEntities(descMatch[1]);

  // ========== BUG FIX v2.1: Path extraction - Mintlify HTML format ==========
  // Mintlify uses Next.js SSR and the content is syntax-highlighted inside span tags

  // Method 1: extract the path from embedded JSON data (most reliable)
  // Format: "path":"/twitter/user/info"
  const jsonPathMatch = html.match(/"path"\s*:\s*"(\/twitter\/[^"]+)"/i);
  if (jsonPathMatch) {
    result.path = jsonPathMatch[1].trim();
  }

  // Method 2: extract from the URL inside a span
  // Format: https://api.twitterapi.io/twitter/user/info</span>
  //     or: https://api.twitterapi.io/oapi/my/info</span>
  if (!result.path) {
    const spanUrlMatch = html.match(/https:\/\/api\.twitterapi\.io(\/(?:twitter|oapi)\/[^<\s"'\\]+)/i);
    if (spanUrlMatch) {
      result.path = spanUrlMatch[1].replace(/\\$/, '').trim();
    }
  }

  // Method 3: plain text URL (fallback)
  if (!result.path) {
    const plainUrlMatch = html.match(/--url\s+https?:\/\/api\.twitterapi\.io([^\s\\'"<]+)/i);
    if (plainUrlMatch) {
      result.path = plainUrlMatch[1].replace(/\\$/, '').trim();
    }
  }

  // Find the HTTP method - from JSON or from text
  // JSON format: "method":"GET"
  const jsonMethodMatch = html.match(/"method"\s*:\s*"(GET|POST|PUT|DELETE|PATCH)"/i);
  if (jsonMethodMatch) {
    result.method = jsonMethodMatch[1].toUpperCase();
  }

  // Fallback: from --request in the cURL example
  if (!result.method) {
    const curlMethodMatch = html.match(/--request\s+(GET|POST|PUT|DELETE|PATCH)/i);
    if (curlMethodMatch) {
      result.method = curlMethodMatch[1].toUpperCase();
    }
  }

  // Alternative: method and path from the endpoint heading
  // Format: "GET / twitter / user / info"
  if (!result.path) {
    const endpointHeaderMatch = html.match(/(GET|POST|PUT|DELETE|PATCH)\s+\/\s*([a-z_\/\s]+)/i);
    if (endpointHeaderMatch) {
      if (!result.method) result.method = endpointHeaderMatch[1].toUpperCase();
      result.path = '/' + endpointHeaderMatch[2].replace(/\s+/g, '').trim();
    }
  }

  // ========== Query parameter extraction ==========
  const parameters = [];

  // Locate the "Query Parameters" section
  const paramSection = html.match(/Query Parameters[\s\S]*?(?=Response|Authorizations|$)/i);
  if (paramSection) {
    // Hoisted: the lowercase section text is the same for every parameter.
    const sectionLower = paramSection[0].toLowerCase();

    // Extract parameter names and descriptions
    const paramMatches = paramSection[0].matchAll(/(\w+)\s+string[^<]*(?:required)?[^<]*(?:<[^>]*>)*\s*([^<]+)/gi);
    for (const m of paramMatches) {
      const paramName = m[1].trim();
      const paramDesc = decodeHtmlEntities(m[2].trim());
      if (paramName && paramName !== 'string' && paramDesc.length > 5) {
        const nameLower = paramName.toLowerCase();
        parameters.push({
          name: paramName,
          description: paramDesc,
          required: sectionLower.includes(nameLower + '" required') ||
                   sectionLower.includes(nameLower + ' required')
        });
      }
    }
  }

  if (parameters.length) result.parameters = parameters;

  // ========== Response schema extraction ==========
  const responseMatch = html.match(/Response\s+200[\s\S]*?(?=Response\s+\d{3}|Authorizations|$)/i);
  if (responseMatch) {
    const responseFields = [];
    const fieldMatches = responseMatch[0].matchAll(/(\w+)\.\s*(\w+)\s+(string|integer|boolean|object|array)/gi);
    for (const m of fieldMatches) {
      responseFields.push({ parent: m[1], field: m[2], type: m[3] });
    }
    // Only the first 50 fields are stored.
    if (responseFields.length) result.response_fields = responseFields.slice(0, 50);
  }

  // Code blocks: prefer <pre> blocks (Mintlify code examples)
  const preBlocks = [];
  for (const m of html.matchAll(/<pre[^>]*>([\s\S]*?)<\/pre>/gi)) {
    const text = stripTagsPreserveWhitespace(m[1]);
    if (text.length > 10) preBlocks.push(text);
  }
  if (preBlocks.length) result.code_snippets = preBlocks;

  // Store the cURL example separately
  const curlFromPre = preBlocks.find((s) => /curl\s+--request/i.test(s));
  if (curlFromPre) {
    result.curl_example = curlFromPre.replace(/\s+\n/g, '\n').trim();
  } else {
    const curlExample = html.match(/curl\s+--request[^<]+/i);
    if (curlExample) {
      result.curl_example = decodeHtmlEntities(curlExample[0]
        .replace(/\\n/g, '\n')
        .replace(/\s+/g, ' ')
        .trim());
    }
  }

  // Raw text - FULL content, no truncation
  result.raw_text = decodeHtmlEntities(html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim());

  return result;
}

/**
 * Scrape every discovered page, blog post and endpoint, then write the
 * combined result to `data/docs.json` next to this script.
 *
 * Individual fetch failures are logged and do not abort the run; a failed
 * endpoint is recorded as `{ name, error, url }`.
 *
 * @returns {Promise<void>}
 * @throws {Error} (as rejection) when target discovery or the file write fails.
 */
async function scrapeAll() {
  console.log('🚀 TwitterAPI.io v2.2 Scraper - Tüm içerikler çekiliyor...\n');

  console.log('🔎 Link keşfi (sitemap + blog index)...\n');
  const targets = await discoverScrapeTargets();
  const pagesToScrape = [...targets.sitePages, ...targets.docsPages];
  const blogsToScrape = targets.siteBlogs;
  const endpointsToScrape = targets.endpoints;

  const docs = {
    meta: {
      source: 'https://twitterapi.io + https://docs.twitterapi.io',
      scraped_at: new Date().toISOString(),
      version: '2.2',
      total_endpoints: endpointsToScrape.length,
      total_pages: pagesToScrape.length,
      total_blogs: blogsToScrape.length
    },
    authentication: {
      header: 'x-api-key',
      header_value: 'YOUR_API_KEY',
      base_url: 'https://api.twitterapi.io',
      dashboard_url: 'https://twitterapi.io/dashboard'
    },
    qps_limits: {
      free: '1 request per 5 seconds',
      paid: {
        '1000_credits': '3 QPS',
        '5000_credits': '6 QPS',
        '10000_credits': '10 QPS',
        '50000_credits': '20 QPS'
      }
    },
    pricing: {
      credits_per_usd: 100000,
      costs: {
        tweets: '15 credits per tweet',
        profiles: '18 credits per user',
        followers: '15 credits per follower',
        list_calls: '150 credits per call'
      },
      minimum_charge: '15 credits ($0.00015) per request'
    },
    endpoints: {},
    pages: {},
    blogs: {}
  };

  console.log(`📄 Sayfalar çekiliyor... (${pagesToScrape.length})\n`);
  for (const page of pagesToScrape) {
    process.stdout.write(` ${page.name}... `);
    try {
      const html = await fetchPage(page.url);
      const content = extractMainSiteContent(html, page.name);
      content.url = page.url;
      content.category = page.category;
      docs.pages[page.name] = content;
      console.log('✅');
    } catch (err) {
      console.log('❌', err.message);
    }
    // Small pause between requests to avoid hammering the site.
    await new Promise((r) => setTimeout(r, 200));
  }

  console.log(`\n📰 Blog yazıları çekiliyor... (${blogsToScrape.length})\n`);
  for (const blog of blogsToScrape) {
    process.stdout.write(` ${blog.name}... `);
    try {
      const html = await fetchPage(blog.url);
      const content = extractMainSiteContent(html, blog.name);
      content.url = blog.url;
      content.category = 'blog';
      docs.blogs[blog.name] = content;
      console.log('✅');
    } catch (err) {
      console.log('❌', err.message);
    }
    await new Promise((r) => setTimeout(r, 200));
  }

  console.log(`\n📚 API Endpoint dokümanları çekiliyor... (${endpointsToScrape.length})\n`);
  for (const endpoint of endpointsToScrape) {
    const url = `https://docs.twitterapi.io/api-reference/endpoint/${endpoint}`;
    process.stdout.write(` ${endpoint}... `);
    try {
      const html = await fetchPage(url);
      docs.endpoints[endpoint] = extractEndpointContent(html, endpoint);
      console.log('✅');
    } catch (err) {
      console.log('❌', err.message);
      docs.endpoints[endpoint] = { name: endpoint, error: err.message, url };
    }
    await new Promise((r) => setTimeout(r, 150));
  }

  const outPath = path.join(__dirname, 'data', 'docs.json');
  fs.mkdirSync(path.dirname(outPath), { recursive: true });

  // Recompute meta counts from what was actually stored. Failed pages/blogs are
  // absent; failed endpoints are stored as error records and are still counted.
  const stats = {
    endpoints: Object.keys(docs.endpoints).length,
    pages: Object.keys(docs.pages).length,
    blogs: Object.keys(docs.blogs).length
  };
  docs.meta.total_endpoints = stats.endpoints;
  docs.meta.total_pages = stats.pages;
  docs.meta.total_blogs = stats.blogs;

  fs.writeFileSync(outPath, JSON.stringify(docs, null, 2));

  console.log(`
✅ Scraping tamamlandı! (v2.2)
   - ${stats.endpoints} endpoint
   - ${stats.pages} sayfa
   - ${stats.blogs} blog yazısı

📁 Kaydedildi: ${outPath}
`);
}

scrapeAll().catch((err) => {
  console.error(err);
  // Signal failure to the calling shell/CI instead of exiting with status 0.
  process.exitCode = 1;
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/dorukardahan/twitterapi-io-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

scrape-all.cjs•21.8 KiB