scraper.js•11.8 kB
import { chromium } from "playwright";
import fs from "fs/promises";
import path from "path";
// Raw MAX_POSTS override from the environment; NaN when the variable is unset or not numeric.
const envMaxPosts = Number.parseInt(process.env.MAX_POSTS ?? "", 10);

// Post limit applied when a caller passes no maxPosts option: the env override if usable, else 12.
const DEFAULT_MAX_POSTS = !Number.isFinite(envMaxPosts) ? 12 : envMaxPosts;

// App id sent in the X-IG-App-ID request header; an unset or empty IG_WEB_APP_ID uses the hard-coded default.
const INSTAGRAM_WEB_APP_ID = process.env.IG_WEB_APP_ID
  ? process.env.IG_WEB_APP_ID
  : "936619743392459";

// Shared en-US formatter used by formatCount to add thousands separators.
const numberFormatter = new Intl.NumberFormat("en-US");
/**
 * Normalise a user-supplied Instagram handle for use in URLs.
 *
 * Surrounding whitespace is removed first so that a handle such as " @name"
 * still has its leading "@" stripped; the result is trimmed again so that
 * "@ name" keeps yielding "name".
 *
 * @param {string} username - Handle as typed by the caller, with or without "@".
 * @returns {string} The handle without a leading "@" or surrounding whitespace.
 */
function normaliseUsername(username) {
  return username.trim().replace(/^@/, "").trim();
}
/**
 * Render a numeric count with the module's shared number formatter.
 *
 * @param {*} value - Candidate count; anything that is not of type "number" is rejected.
 * @returns {string|null} The formatted count, or null when value is not a number.
 */
function formatCount(value) {
  if (typeof value === "number") {
    try {
      return numberFormatter.format(value);
    } catch {
      // Formatting failed: fall back to the plain string form of the number.
      return String(value);
    }
  }
  return null;
}
/**
 * Decide whether the browser should run headless.
 *
 * Precedence: an explicit boolean option wins; otherwise the
 * PLAYWRIGHT_HEADLESS environment variable is consulted; otherwise headless
 * mode is on.
 *
 * @param {boolean|undefined} headlessOption - Caller-supplied override; non-boolean values are ignored.
 * @returns {boolean} true to run headless, false to show the browser window.
 */
function resolveHeadlessFlag(headlessOption) {
  if (typeof headlessOption === "boolean") {
    return headlessOption;
  }
  const envValue = process.env.PLAYWRIGHT_HEADLESS;
  if (envValue === undefined) {
    return true;
  }
  // Compare case-insensitively and ignore stray whitespace so that values such
  // as "False" or " FALSE " disable headless mode instead of being read as "on".
  const normalisedValue = envValue.trim().toLowerCase();
  return normalisedValue !== "false" && normalisedValue !== "0";
}
/**
 * Read and parse a saved Playwright storage-state JSON file.
 *
 * @param {string|undefined} storageStatePath - Path of the JSON file; a falsy value skips loading.
 * @returns {Promise<object|undefined>} The parsed storage state, or undefined when no path is given.
 * @throws {Error} When the file is missing (with login instructions) or cannot be read or parsed.
 *   The underlying error is attached as `cause` so its code and stack are not lost.
 */
async function loadStorageState(storageStatePath) {
  if (!storageStatePath) {
    return undefined;
  }
  try {
    const data = await fs.readFile(storageStatePath, "utf-8");
    return JSON.parse(data);
  } catch (error) {
    if (error.code === "ENOENT") {
      throw new Error(
        `Playwright storage state not found at ${storageStatePath}. Log in once manually and save the session.`,
        { cause: error }
      );
    }
    // Covers both other file-system failures and invalid JSON content.
    throw new Error(`Unable to load Playwright storage state: ${error.message}`, { cause: error });
  }
}
/**
 * Convert one timeline-media edge node from the profile payload into the
 * scraper's flat post record.
 *
 * @param {object|null|undefined} node - The `node` object of a timeline edge.
 * @returns {object|null} Post record (URLs, caption, counts, video info), or null for a falsy node.
 */
function extractPostFromEdge(node) {
  if (!node) {
    return null;
  }

  const {
    shortcode,
    display_url: displayUrl,
    accessibility_caption: accessibilityText,
    taken_at_timestamp: takenAtSeconds,
    is_video: videoFlag,
    video_url: rawVideoUrl,
  } = node;

  let postUrl = null;
  if (shortcode) {
    postUrl = `https://www.instagram.com/p/${shortcode}/`;
  }

  // The payload value is multiplied by 1000 before being handed to Date.
  let timestamp = null;
  if (takenAtSeconds) {
    timestamp = new Date(takenAtSeconds * 1000).toISOString();
  }

  const firstCaption = node.edge_media_to_caption?.edges?.[0]?.node?.text ?? null;
  const caption = accessibilityText ?? null;

  let videoUrl = null;
  if (videoFlag) {
    videoUrl = rawVideoUrl ?? null;
  }

  return {
    postUrl,
    imageUrl: displayUrl ?? null,
    altText: caption,
    accessibilityCaption: caption,
    timestamp,
    captionPreview: firstCaption,
    likeCount: node.edge_liked_by?.count ?? null,
    commentCount: node.edge_media_to_comment?.count ?? null,
    isVideo: videoFlag ?? false,
    videoUrl,
  };
}
/**
 * Convert one item from the user-feed payload into the scraper's flat post
 * record (same field set as extractPostFromEdge produces).
 *
 * @param {object|null|undefined} item - Feed item object.
 * @returns {object|null} Post record, or null for a falsy item.
 */
function extractPostFromFeedItem(item) {
  if (!item) {
    return null;
  }

  const firstCarouselEntry = item.carousel_media?.[0];

  // The item's own code wins; the first carousel entry's code is the fallback.
  const shortcode = item.code ?? firstCarouselEntry?.code ?? null;
  const postUrl = shortcode ? `https://www.instagram.com/p/${shortcode}/` : null;

  let timestamp = null;
  if (item.taken_at) {
    timestamp = new Date(item.taken_at * 1000).toISOString();
  }

  // Only media_type 2 (number or string) is treated as a video.
  const isVideo = [2, "2"].includes(item.media_type);

  const imageUrl =
    item.image_versions2?.candidates?.[0]?.url ||
    firstCarouselEntry?.image_versions2?.candidates?.[0]?.url ||
    null;

  let videoUrl = null;
  if (isVideo) {
    videoUrl =
      item.video_versions?.[0]?.url ||
      item.carousel_media?.find((entry) => entry.video_versions)?.video_versions?.[0]?.url ||
      null;
  }

  const accessibilityText = item.accessibility_caption ?? null;

  return {
    postUrl,
    imageUrl,
    altText: accessibilityText,
    accessibilityCaption: accessibilityText,
    timestamp,
    captionPreview: item.caption?.text ?? null,
    likeCount: item.like_count ?? null,
    commentCount: item.comment_count ?? null,
    isVideo,
    videoUrl,
  };
}
/**
 * Fetch profile metadata and recent posts through Instagram's web JSON
 * endpoints, using the request client of the given browser context.
 *
 * The web_profile_info endpoint is queried first. When it yields no posts and
 * a user id is known, the per-user feed endpoint is tried as a best-effort
 * fallback: any failure there is logged and the profile is still returned
 * (with an empty post list) rather than discarding the metadata already fetched.
 *
 * @param {object} context - Object exposing `request.get(url, { headers })` whose
 *   response offers `status()`, `ok()` and `json()`.
 * @param {string} username - Handle without a leading "@".
 * @param {number} limit - Maximum number of posts to return.
 * @param {(message: string) => void} [log] - Optional progress logger.
 * @returns {Promise<object>} Profile record: profileUrl, username, displayName,
 *   biography, avatarUrl, stats, posts, id, isPrivate.
 * @throws {Error} On a 404, any other non-OK status, or a payload without user data.
 */
async function fetchProfileViaApi(context, username, limit, log) {
  const commonHeaders = {
    "X-IG-App-ID": INSTAGRAM_WEB_APP_ID,
    Accept: "application/json",
    Referer: `https://www.instagram.com/${username}/`,
  };
  const profileUrl = `https://www.instagram.com/api/v1/users/web_profile_info/?username=${encodeURIComponent(username)}`;
  log?.(`Fetching profile JSON from ${profileUrl}`);
  const response = await context.request.get(profileUrl, { headers: commonHeaders });
  if (response.status() === 404) {
    throw new Error(`Instagram profile ${username} not found (404).`);
  }
  if (!response.ok()) {
    throw new Error(`Instagram profile API responded with ${response.status()}`);
  }
  const payload = await response.json();
  const user = payload?.data?.user;
  if (!user) {
    throw new Error("Instagram profile API returned no user data");
  }

  const edges = user.edge_owner_to_timeline_media?.edges ?? [];
  // `edge?.node` tolerates null entries; extractPostFromEdge maps them to null
  // and filter(Boolean) drops them.
  let posts = edges
    .slice(0, limit)
    .map((edge) => extractPostFromEdge(edge?.node))
    .filter(Boolean);

  if (posts.length === 0 && user.id) {
    const feedUrl = `https://www.instagram.com/api/v1/feed/user/${user.id}/?count=${Math.min(limit, 50)}`;
    log?.(`Primary profile info returned no posts. Fetching user feed: ${feedUrl}`);
    try {
      const feedResponse = await context.request.get(feedUrl, { headers: commonHeaders });
      if (feedResponse.ok()) {
        const feedData = await feedResponse.json();
        const items = Array.isArray(feedData?.items) ? feedData.items : [];
        posts = items
          .slice(0, limit)
          .map((item) => extractPostFromFeedItem(item))
          .filter(Boolean);
        log?.(`User feed returned ${posts.length} items.`);
      } else {
        log?.(`User feed request failed with status ${feedResponse.status()}`);
      }
    } catch (feedError) {
      // The feed is only a fallback: keep the profile data and report the failure.
      log?.(`User feed request failed: ${feedError.message}`);
    }
  }

  return {
    profileUrl: `https://www.instagram.com/${username}/`,
    username: user.username ?? username,
    displayName: user.full_name || null,
    biography: user.biography || null,
    avatarUrl: user.profile_pic_url_hd || user.profile_pic_url || null,
    stats: {
      posts: formatCount(user.edge_owner_to_timeline_media?.count ?? null),
      followers: formatCount(user.edge_followed_by?.count ?? null),
      following: formatCount(user.edge_follow?.count ?? null),
    },
    posts,
    id: user.id ?? null,
    isPrivate: user.is_private ?? null,
  };
}
/**
 * Scrape an Instagram profile with Playwright.
 *
 * Flow: launch Chromium, open the profile page with the saved session, try the
 * web JSON API, and fall back to scraping the rendered DOM when the API fails
 * or returns no posts. When the API succeeded but had no posts, its profile
 * metadata is kept and only the post list is taken from the DOM.
 *
 * @param {string} username - Instagram handle, with or without a leading "@".
 * @param {object} [options]
 * @param {boolean} [options.headless] - Explicit headless override (see resolveHeadlessFlag).
 * @param {string} [options.storageStatePath] - Storage-state JSON path. Defaults to
 *   PLAYWRIGHT_STORAGE_STATE, else "storageState.json", resolved to an absolute path.
 * @param {number} [options.maxPosts] - Maximum posts to collect (default DEFAULT_MAX_POSTS).
 * @param {number} [options.navigationTimeout=60000] - Timeout for page.goto, in milliseconds.
 * @param {string} [options.postSelector] - CSS selector for post links in the DOM fallback.
 * @param {string} [options.waitUntil] - Navigation wait condition. Defaults to
 *   PLAYWRIGHT_WAIT_UNTIL, else "domcontentloaded".
 * @param {(message: string) => void} [options.logger] - Optional progress logger.
 * @returns {Promise<{scrapedAt: string, username: string, profile: object}>}
 *   `username` is the value passed in, unnormalised.
 * @throws {Error} When username is missing, the storage state cannot be loaded,
 *   or Instagram redirects to the login page.
 */
export async function scrapeProfile(username, options = {}) {
  if (!username) {
    throw new Error("Instagram username is required");
  }
  const {
    headless,
    storageStatePath = process.env.PLAYWRIGHT_STORAGE_STATE
      ? path.resolve(process.env.PLAYWRIGHT_STORAGE_STATE)
      : path.resolve("storageState.json"),
    maxPosts = DEFAULT_MAX_POSTS,
    navigationTimeout = 60_000,
    postSelector = 'main a[href*="/p/"], main a[href*="/reel"], main a[href*="/reels/"]',
    waitUntil = process.env.PLAYWRIGHT_WAIT_UNTIL || "domcontentloaded",
    logger,
  } = options;

  const browserOptions = {
    headless: resolveHeadlessFlag(headless),
  };
  if (process.env.PLAYWRIGHT_BROWSER_CHANNEL) {
    browserOptions.channel = process.env.PLAYWRIGHT_BROWSER_CHANNEL;
  }

  const log = (message) => {
    if (typeof logger === "function") {
      logger(message);
    }
  };

  const normalisedUsername = normaliseUsername(username);
  log(`Launching Chromium (headless=${browserOptions.headless})`);
  const browser = await chromium.launch(browserOptions);
  let context;
  try {
    log(`Creating browser context using storageState: ${storageStatePath}`);
    context = await browser.newContext({
      viewport: { width: 1365, height: 768 },
      storageState: await loadStorageState(storageStatePath),
    });
    const page = await context.newPage();
    const profileUrl = `https://www.instagram.com/${normalisedUsername}/`;
    log(`Navigating to ${profileUrl}`);
    log(`Waiting for navigation (${waitUntil})`);
    await page.goto(profileUrl, {
      waitUntil,
      timeout: navigationTimeout,
    });
    log(`Arrived at ${page.url()}`);
    if (page.url().includes("/accounts/login")) {
      throw new Error(
        "Instagram redirected to the login page. Verify storageState.json contains a logged-in session."
      );
    }

    log(`Collecting up to ${maxPosts} posts`);
    let apiProfile = null;
    try {
      apiProfile = await fetchProfileViaApi(context, normalisedUsername, maxPosts, log);
      log(
        `Profile data fetched via Instagram web API (${apiProfile.posts.length} posts returned)`
      );
      apiProfile.posts.forEach((post, index) => {
        log(`post[${index + 1}] ${JSON.stringify(post)}`);
      });
    } catch (apiError) {
      log(`Web API fetch failed, will try DOM scrape: ${apiError.message}`);
    }

    let profileData = apiProfile;
    if (!apiProfile || apiProfile.posts.length === 0) {
      if (apiProfile) {
        log("API returned zero posts; falling back to DOM scrape to double-check.");
      }
      const domProfile = await scrapeProfileFromDom(page, maxPosts, postSelector, log);
      // Keep the richer API metadata (id, isPrivate, formatted stats) when the
      // API call itself succeeded; the DOM pass only re-checks the post list.
      profileData = apiProfile ? { ...apiProfile, posts: domProfile.posts } : domProfile;
    }

    if (!profileData || profileData.posts.length === 0) {
      log(`No posts were collected for ${normalisedUsername}`);
    } else {
      log(`Scraped ${profileData.posts.length} posts for ${profileData.username ?? normalisedUsername}`);
    }

    return {
      scrapedAt: new Date().toISOString(),
      username,
      profile: profileData,
    };
  } finally {
    log("Closing browser context");
    await closeBrowserSession(browser, context);
  }
}

/**
 * DOM fallback: wait briefly for the feed to render, then read profile
 * metadata from the page's meta tags and collect post links.
 *
 * @param {object} page - Playwright page already navigated to the profile.
 * @param {number} postLimit - Maximum number of post links to return.
 * @param {string} linkSelector - CSS selector matching post links.
 * @param {(message: string) => void} log - Progress logger.
 * @returns {Promise<object>} Profile record: profileUrl, username, displayName,
 *   biography, avatarUrl, stats, posts.
 */
async function scrapeProfileFromDom(page, postLimit, linkSelector, log) {
  log("Waiting for post feed to render (DOM scrape fallback)");
  try {
    await page.waitForSelector("article", { timeout: 20_000 });
  } catch (waitError) {
    // Best effort: scrape whatever is present even if no article appeared.
    log(`Timed out waiting for article element: ${waitError.message}`);
  }

  // The callback is handed to page.evaluate, so it uses only its argument and
  // page globals (document, location) and closes over nothing from this module.
  return page.evaluate(
    ({ limit, selector }) => {
      const ogDescription = document.querySelector('meta[property="og:description"]')?.content ?? "";
      const ogTitle = document.querySelector('meta[property="og:title"]')?.content ?? "";
      const profileImage = document.querySelector('meta[property="og:image"]')?.content ?? "";
      const canonicalLink = document.querySelector('link[rel="canonical"]')?.href ?? location.href;

      // The part of og:description before " - " is split into "<count> <label>" pairs.
      const statsEntries = ogDescription
        .split(" - ")[0]
        .split(", ")
        .map((item) => item.trim())
        .filter(Boolean);
      const stats = {};
      for (const entry of statsEntries) {
        const [count, ...labelParts] = entry.split(" ");
        if (count && labelParts.length > 0) {
          stats[labelParts.join(" ").toLowerCase()] = count;
        }
      }

      const toAbsolute = (href) => (href.startsWith("http") ? href : `${location.origin}${href}`);
      const seen = new Set();
      const posts = Array.from(document.querySelectorAll(selector))
        .filter((link) => {
          const href = link.getAttribute("href") ?? "";
          const absolute = toAbsolute(href);
          if (!/\/(p|reel|reels)\//.test(href) || seen.has(absolute)) {
            return false;
          }
          seen.add(absolute);
          return true;
        })
        .slice(0, limit)
        .map((link) => {
          const img = link.querySelector("img");
          const timeEl = link.querySelector("time");
          const href = link.getAttribute("href") ?? "";
          return {
            postUrl: toAbsolute(href),
            // currentSrc is "" (not null) until a source is selected, so `||`
            // is needed for the fallback to src to ever apply.
            imageUrl: img?.currentSrc || img?.src || null,
            altText: img?.alt ?? null,
            accessibilityCaption: link.getAttribute("aria-label") ?? null,
            timestamp: timeEl?.getAttribute("datetime") ?? null,
            captionPreview: img?.getAttribute("alt") ?? null,
          };
        });

      // Heuristic: the last non-empty div inside the header section is used as the bio.
      const bioCandidate = Array.from(document.querySelectorAll("header section div"))
        .map((node) => node.textContent?.trim())
        .filter((text) => text)
        .slice(-1)[0] ?? null;

      // Capture only the handle inside "(@handle)", so any text that follows
      // the closing parenthesis in og:title is not appended to the username.
      const handleMatch = ogTitle.match(/\(@([^)\s]+)\)/);

      return {
        profileUrl: canonicalLink,
        username: handleMatch?.[1] ?? null,
        displayName: ogTitle.split(" (@")[0] || null,
        biography: bioCandidate,
        avatarUrl: profileImage || null,
        stats,
        posts,
      };
    },
    { limit: postLimit, selector: linkSelector }
  );
}

/**
 * Close the context (if one was created) and then the browser. The browser is
 * closed even when closing the context throws, so the browser is not leaked;
 * the context error still propagates to the caller.
 *
 * @param {object} browser - Launched browser.
 * @param {object|undefined} context - Browser context, or undefined if creation failed.
 * @returns {Promise<void>}
 */
async function closeBrowserSession(browser, context) {
  try {
    await context?.close();
  } finally {
    await browser.close();
  }
}