Instagram MCP Investigator

scraper.js•11.8 kB

import { chromium } from "playwright";
import fs from "fs/promises";
import path from "path";

const envMaxPosts = Number.parseInt(process.env.MAX_POSTS ?? "", 10);

// A negative MAX_POSTS would turn `slice(0, limit)` into "drop the last N posts",
// so only non-negative values are honoured; anything else falls back to 12.
const DEFAULT_MAX_POSTS = Number.isFinite(envMaxPosts) && envMaxPosts >= 0 ? envMaxPosts : 12;

// Value sent in the X-IG-App-ID request header; overridable through IG_WEB_APP_ID.
const INSTAGRAM_WEB_APP_ID = process.env.IG_WEB_APP_ID || "936619743392459";

const numberFormatter = new Intl.NumberFormat("en-US");

/**
 * Strips surrounding whitespace and a single leading "@" from a username.
 *
 * Whitespace is removed first so that input such as " @name" loses its "@" too.
 *
 * @param {string} username - Raw username as supplied by the caller.
 * @returns {string} The cleaned username (possibly empty).
 */
function normaliseUsername(username) {
  return username.trim().replace(/^@/, "").trim();
}

/**
 * Builds the public profile page URL for a username.
 *
 * The username is percent-encoded so characters such as "/" or "?" cannot
 * redirect the request to a different path.
 *
 * @param {string} username - Normalised username.
 * @returns {string} Absolute profile URL with a trailing slash.
 */
function buildProfileUrl(username) {
  return `https://www.instagram.com/${encodeURIComponent(username)}/`;
}

/**
 * Formats a numeric count with en-US digit grouping.
 *
 * @param {unknown} value - Count to format.
 * @returns {string|null} Formatted count, or null when `value` is not a number.
 */
function formatCount(value) {
  if (typeof value !== "number") {
    return null;
  }
  try {
    return numberFormatter.format(value);
  } catch {
    return `${value}`;
  }
}

/**
 * Decides whether the browser should run headless.
 *
 * An explicit boolean option wins; otherwise PLAYWRIGHT_HEADLESS is consulted,
 * where only "false" and "0" disable headless mode. Unset means headless.
 *
 * @param {boolean|undefined} headlessOption - Caller-supplied override.
 * @returns {boolean} True when the browser should be headless.
 */
function resolveHeadlessFlag(headlessOption) {
  if (typeof headlessOption === "boolean") {
    return headlessOption;
  }
  const envValue = process.env.PLAYWRIGHT_HEADLESS;
  if (envValue === undefined) {
    return true;
  }
  return envValue !== "false" && envValue !== "0";
}

/**
 * Reads and parses a Playwright storage-state JSON file.
 *
 * @param {string|undefined} storageStatePath - Path to the file; falsy skips loading.
 * @returns {Promise<object|undefined>} Parsed state, or undefined when no path is given.
 * @throws {Error} When the file is missing, unreadable or not valid JSON. The
 *   underlying error is attached as `cause`.
 */
async function loadStorageState(storageStatePath) {
  if (!storageStatePath) {
    return undefined;
  }
  try {
    const data = await fs.readFile(storageStatePath, "utf-8");
    return JSON.parse(data);
  } catch (error) {
    if (error.code === "ENOENT") {
      throw new Error(
        `Playwright storage state not found at ${storageStatePath}. Log in once manually and save the session.`,
        { cause: error }
      );
    }
    throw new Error(`Unable to load Playwright storage state: ${error.message}`, { cause: error });
  }
}

/**
 * Maps a timeline media node from the web profile payload to the post shape
 * returned by this module.
 *
 * @param {object|null|undefined} node - `node` of an `edge_owner_to_timeline_media` edge.
 * @returns {object|null} Post record, or null when `node` is falsy.
 */
function extractPostFromEdge(node) {
  if (!node) {
    return null;
  }
  const captionEdge = node.edge_media_to_caption?.edges?.[0]?.node?.text ?? null;
  // The timestamp is multiplied by 1000 before being handed to Date.
  const timestamp = node.taken_at_timestamp
    ? new Date(node.taken_at_timestamp * 1000).toISOString()
    : null;
  return {
    postUrl: node.shortcode ? `https://www.instagram.com/p/${node.shortcode}/` : null,
    imageUrl: node.display_url ?? null,
    altText: node.accessibility_caption ?? null,
    accessibilityCaption: node.accessibility_caption ?? null,
    timestamp,
    captionPreview: captionEdge,
    likeCount: node.edge_liked_by?.count ?? null,
    commentCount: node.edge_media_to_comment?.count ?? null,
    isVideo: node.is_video ?? false,
    videoUrl: node.is_video ? node.video_url ?? null : null,
  };
}

/**
 * Maps an item from the user feed payload to the post shape returned by this
 * module. For image and video URLs the first carousel entry (or the first
 * carousel entry that has video versions) is used as a fallback.
 *
 * @param {object|null|undefined} item - Entry of the feed response `items` array.
 * @returns {object|null} Post record, or null when `item` is falsy.
 */
function extractPostFromFeedItem(item) {
  if (!item) {
    return null;
  }
  const caption = item.caption?.text ?? null;
  const timestamp = item.taken_at ? new Date(item.taken_at * 1000).toISOString() : null;
  // media_type 2 (number or string) is treated as video.
  const isVideo = item.media_type === 2 || item.media_type === "2";
  const code = item.code ?? item.carousel_media?.[0]?.code ?? null;
  const imageCandidate =
    item.image_versions2?.candidates?.[0]?.url ||
    item.carousel_media?.[0]?.image_versions2?.candidates?.[0]?.url ||
    null;
  const videoCandidate = isVideo
    ? item.video_versions?.[0]?.url ||
      item.carousel_media?.find((media) => media.video_versions)?.video_versions?.[0]?.url ||
      null
    : null;
  return {
    postUrl: code ? `https://www.instagram.com/p/${code}/` : null,
    imageUrl: imageCandidate,
    altText: item.accessibility_caption ?? null,
    accessibilityCaption: item.accessibility_caption ?? null,
    timestamp,
    captionPreview: caption,
    likeCount: item.like_count ?? null,
    commentCount: item.comment_count ?? null,
    isVideo: Boolean(isVideo),
    videoUrl: videoCandidate,
  };
}

/**
 * Fetches profile details and recent posts through Instagram's web JSON
 * endpoints, using the browser context's request client.
 *
 * When the profile payload carries no timeline posts and a user id is known,
 * the user feed endpoint is queried as a second source.
 *
 * @param {object} context - Playwright browser context; only `context.request.get` is used.
 * @param {string} username - Normalised username.
 * @param {number} limit - Maximum number of posts to return.
 * @param {(message: string) => void} [log] - Optional progress logger.
 * @returns {Promise<object>} Profile record including `stats` and `posts`.
 * @throws {Error} On a 404, any other non-OK status, or a payload without user data.
 */
async function fetchProfileViaApi(context, username, limit, log) {
  const publicProfileUrl = buildProfileUrl(username);
  const commonHeaders = {
    "X-IG-App-ID": INSTAGRAM_WEB_APP_ID,
    Accept: "application/json",
    Referer: publicProfileUrl,
  };
  const profileUrl = `https://www.instagram.com/api/v1/users/web_profile_info/?username=${encodeURIComponent(username)}`;
  log?.(`Fetching profile JSON from ${profileUrl}`);
  const response = await context.request.get(profileUrl, { headers: commonHeaders });
  if (response.status() === 404) {
    throw new Error(`Instagram profile ${username} not found (404).`);
  }
  if (!response.ok()) {
    throw new Error(`Instagram profile API responded with ${response.status()}`);
  }
  const payload = await response.json();
  const user = payload?.data?.user;
  if (!user) {
    throw new Error("Instagram profile API returned no user data");
  }

  const edges = user.edge_owner_to_timeline_media?.edges ?? [];
  // `edge?.node` keeps a null entry in the edge list from throwing.
  let posts = edges
    .slice(0, limit)
    .map((edge) => extractPostFromEdge(edge?.node))
    .filter(Boolean);

  if (posts.length === 0 && user.id) {
    const feedUrl = `https://www.instagram.com/api/v1/feed/user/${user.id}/?count=${Math.min(limit, 50)}`;
    log?.(`Primary profile info returned no posts. Fetching user feed: ${feedUrl}`);
    const feedResponse = await context.request.get(feedUrl, { headers: commonHeaders });
    if (feedResponse.ok()) {
      const feedData = await feedResponse.json();
      const items = feedData?.items ?? [];
      posts = items
        .slice(0, limit)
        .map((item) => extractPostFromFeedItem(item))
        .filter(Boolean);
      log?.(`User feed returned ${posts.length} items.`);
    } else {
      // A failed feed request is not fatal; the profile is returned without posts.
      log?.(`User feed request failed with status ${feedResponse.status()}`);
    }
  }

  return {
    profileUrl: publicProfileUrl,
    username: user.username ?? username,
    displayName: user.full_name || null,
    biography: user.biography || null,
    avatarUrl: user.profile_pic_url_hd || user.profile_pic_url || null,
    stats: {
      posts: formatCount(user.edge_owner_to_timeline_media?.count ?? null),
      followers: formatCount(user.edge_followed_by?.count ?? null),
      following: formatCount(user.edge_follow?.count ?? null),
    },
    posts,
    id: user.id ?? null,
    isPrivate: user.is_private ?? null,
  };
}

/**
 * Collects profile details and post links from the rendered profile page.
 *
 * This function is handed to `page.evaluate`, so it executes inside the page
 * and must stay self-contained: it may only use its argument and DOM globals,
 * never anything from this module's scope.
 *
 * @param {{ limit: number, selector: string }} args - Post limit and the CSS
 *   selector matching post links.
 * @returns {object} Profile record with `stats` and `posts` read from the DOM.
 */
function collectProfileFromDom({ limit, selector }) {
  const ogDescription = document.querySelector('meta[property="og:description"]')?.content ?? "";
  const ogTitle = document.querySelector('meta[property="og:title"]')?.content ?? "";
  const profileImage = document.querySelector('meta[property="og:image"]')?.content ?? "";
  const canonicalLink = document.querySelector('link[rel="canonical"]')?.href ?? location.href;

  // The part of og:description before " - " is read as "<count> <label>, ..." pairs.
  const statsEntries = ogDescription
    .split(" - ")[0]
    .split(", ")
    .map((item) => item.trim())
    .filter(Boolean);
  const stats = {};
  for (const entry of statsEntries) {
    const [count, ...labelParts] = entry.split(" ");
    if (count && labelParts.length > 0) {
      stats[labelParts.join(" ").toLowerCase()] = count;
    }
  }

  const toAbsolute = (href) => (href.startsWith("http") ? href : `${location.origin}${href}`);

  const seen = new Set();
  const posts = Array.from(document.querySelectorAll(selector))
    .filter((link) => {
      const href = link.getAttribute("href") ?? "";
      const absolute = toAbsolute(href);
      if (!/\/(p|reel|reels)\//.test(href) || seen.has(absolute)) {
        return false;
      }
      seen.add(absolute);
      return true;
    })
    .slice(0, limit)
    .map((link) => {
      const img = link.querySelector("img");
      const timeEl = link.querySelector("time");
      const href = link.getAttribute("href") ?? "";
      return {
        postUrl: toAbsolute(href),
        // currentSrc is an empty string (not null) until a source has been
        // selected, so `||` is needed for the `src` fallback to apply.
        imageUrl: img?.currentSrc || img?.src || null,
        altText: img?.alt ?? null,
        accessibilityCaption: link.getAttribute("aria-label") ?? null,
        timestamp: timeEl?.getAttribute("datetime") ?? null,
        captionPreview: img?.getAttribute("alt") ?? null,
      };
    });

  // Heuristic: the last non-empty <div> text inside the header section.
  const bioCandidate =
    Array.from(document.querySelectorAll("header section div"))
      .map((node) => node.textContent?.trim())
      .filter((text) => text)
      .slice(-1)[0] ?? null;

  // og:title is read as "<display name> (@<username>)". Cutting at the first ")"
  // keeps any text that follows the parenthesis out of the username.
  const [titleName, titleHandle] = ogTitle.split(" (@");

  return {
    profileUrl: canonicalLink,
    username: titleHandle?.split(")")[0] ?? null,
    displayName: titleName || null,
    biography: bioCandidate,
    avatarUrl: profileImage || null,
    stats,
    posts,
  };
}

/**
 * Scrapes an Instagram profile with Playwright's Chromium.
 *
 * The profile page is opened with a saved storage state, then the web JSON API
 * is tried first. If that fails or yields no posts, the rendered DOM is scraped
 * instead.
 *
 * @param {string} username - Instagram username, with or without a leading "@".
 * @param {object} [options]
 * @param {boolean} [options.headless] - Overrides PLAYWRIGHT_HEADLESS.
 * @param {string} [options.storageStatePath] - Storage-state JSON path. Defaults to
 *   PLAYWRIGHT_STORAGE_STATE, or "storageState.json" in the working directory.
 * @param {number} [options.maxPosts] - Maximum posts to collect.
 * @param {number} [options.navigationTimeout=60000] - Timeout for page.goto, in ms.
 * @param {string} [options.postSelector] - CSS selector for post links (DOM fallback).
 * @param {string} [options.waitUntil] - Playwright `waitUntil` value for navigation.
 * @param {(message: string) => void} [options.logger] - Progress logger.
 * @returns {Promise<{scrapedAt: string, username: string, profile: object}>}
 *   `username` echoes the caller's input unchanged.
 * @throws {Error} When the username is missing or empty after normalisation, the
 *   storage state cannot be loaded, or Instagram redirects to the login page.
 */
export async function scrapeProfile(username, options = {}) {
  // Validate before launching a browser: "@" or whitespace alone would otherwise
  // navigate to "https://www.instagram.com//".
  const normalisedUsername = typeof username === "string" ? normaliseUsername(username) : "";
  if (!normalisedUsername) {
    throw new Error("Instagram username is required");
  }

  const {
    headless,
    storageStatePath = process.env.PLAYWRIGHT_STORAGE_STATE
      ? path.resolve(process.env.PLAYWRIGHT_STORAGE_STATE)
      : path.resolve("storageState.json"),
    maxPosts = DEFAULT_MAX_POSTS,
    navigationTimeout = 60_000,
    postSelector = 'main a[href*="/p/"], main a[href*="/reel"], main a[href*="/reels/"]',
    waitUntil = process.env.PLAYWRIGHT_WAIT_UNTIL || "domcontentloaded",
    logger,
  } = options;

  const browserOptions = {
    headless: resolveHeadlessFlag(headless),
  };
  if (process.env.PLAYWRIGHT_BROWSER_CHANNEL) {
    browserOptions.channel = process.env.PLAYWRIGHT_BROWSER_CHANNEL;
  }

  const log = (message) => {
    if (typeof logger === "function") {
      logger(message);
    }
  };

  log(`Launching Chromium (headless=${browserOptions.headless})`);
  const browser = await chromium.launch(browserOptions);
  let context;
  try {
    log(`Creating browser context using storageState: ${storageStatePath}`);
    context = await browser.newContext({
      viewport: { width: 1365, height: 768 },
      storageState: await loadStorageState(storageStatePath),
    });
    const page = await context.newPage();
    const profileUrl = buildProfileUrl(normalisedUsername);
    log(`Navigating to ${profileUrl}`);
    log(`Waiting for navigation (${waitUntil})`);
    await page.goto(profileUrl, {
      waitUntil,
      timeout: navigationTimeout,
    });
    log(`Arrived at ${page.url()}`);
    if (page.url().includes("/accounts/login")) {
      throw new Error(
        "Instagram redirected to the login page. Verify storageState.json contains a logged-in session."
      );
    }

    log(`Collecting up to ${maxPosts} posts`);
    let profileData;
    try {
      profileData = await fetchProfileViaApi(context, normalisedUsername, maxPosts, log);
      log(
        `Profile data fetched via Instagram web API (${profileData.posts.length} posts returned)`
      );
      if (profileData.posts.length > 0) {
        profileData.posts.forEach((post, index) => {
          log(`post[${index + 1}] ${JSON.stringify(post)}`);
        });
      }
    } catch (apiError) {
      // Deliberate best-effort: an API failure only triggers the DOM fallback.
      log(`Web API fetch failed, will try DOM scrape: ${apiError.message}`);
    }

    if (!profileData || profileData.posts.length === 0) {
      if (profileData?.posts?.length === 0) {
        log("API returned zero posts; falling back to DOM scrape to double-check.");
      }
      log("Waiting for post feed to render (DOM scrape fallback)");
      try {
        await page.waitForSelector("article", { timeout: 20_000 });
      } catch (waitError) {
        // Keep going: the meta tags may still provide profile details.
        log(`Timed out waiting for article element: ${waitError.message}`);
      }
      // NOTE(review): the DOM result replaces any API profile wholesale, so API-only
      // fields (id, isPrivate, formatted stats) are dropped when the API returned a
      // profile with zero posts. Confirm whether the two should be merged instead.
      profileData = await page.evaluate(collectProfileFromDom, {
        limit: maxPosts,
        selector: postSelector,
      });
    }

    if (!profileData || profileData.posts.length === 0) {
      log(`No posts were collected for ${normalisedUsername}`);
    } else {
      log(`Scraped ${profileData.posts.length} posts for ${profileData.username ?? normalisedUsername}`);
    }

    return {
      scrapedAt: new Date().toISOString(),
      username,
      profile: profileData,
    };
  } finally {
    log("Closing browser context");
    // Nested finally: the browser must be closed even if closing the context
    // rejects, otherwise the Chromium process is leaked.
    try {
      await context?.close();
    } finally {
      await browser.close();
    }
  }
}

Latest Blog Posts

OpenTelemetry for Model Context Protocol (MCP) Analytics and Agent Observability
By Om-Shree-0709 on .
observability
mcp
opentelemetry
Securing Enterprise AI Agents with Unique Identities in the Model Context Protocol (MCP)
By Om-Shree-0709 on .
When Your Year of Work Gets Copied Overnight: What Actually Matters?
By punkpeye on .
startups

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/inoue2002/instagram-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.