import { z } from 'zod';
import type { Platform, UnifiedPost, ScrapeResult } from '../types.js';
import { getPlatform } from '../platforms/index.js';
import { upsertPosts } from '../db/lance.js';
import { calculateBatchEngagement } from '../analysis/engagement.js';
// --- Tool Schemas ---
/**
 * Fields common to every scrape tool. This object is spread into each
 * platform's `z.object({ ... })` shape in `scrapeSchemas`.
 */
const baseSchema = {
  max_results: z
    .number()
    .optional()
    .describe('Maximum number of results to return (default varies by platform)'),
};
/** Builds an optional free-text field carrying the given description. */
const optionalText = (description: string) => z.string().optional().describe(description);

/** Builds an optional list-of-strings field carrying the given description. */
const optionalTextList = (description: string) =>
  z.array(z.string()).optional().describe(description);

/**
 * Zod input schemas for the scrape tools, keyed by tool name.
 *
 * Every field is optional except `scrape_twitter.query`. Each schema also
 * includes the shared `baseSchema` fields, spread in last.
 */
export const scrapeSchemas = {
  // Twitter/X: a single required query, optionally bounded by date.
  scrape_twitter: z.object({
    query: z.string().describe('Search query, hashtag, or username to scrape'),
    type: z
      .enum(['search', 'user'])
      .optional()
      .describe('Type of scrape: search tweets or user timeline'),
    date_from: optionalText('Start date (YYYY-MM-DD)'),
    date_to: optionalText('End date (YYYY-MM-DD)'),
    ...baseSchema,
  }),

  // Instagram: direct URLs, hashtags, or a keyword search.
  scrape_instagram: z.object({
    urls: optionalTextList('Direct Instagram post or profile URLs'),
    hashtags: optionalTextList('Hashtags to search'),
    search: optionalText('Search query'),
    type: z
      .enum(['posts', 'reels', 'stories'])
      .optional()
      .describe('Content type to scrape'),
    ...baseSchema,
  }),

  // TikTok: profiles, hashtags, or a keyword search.
  scrape_tiktok: z.object({
    profiles: optionalTextList('TikTok profile usernames or URLs'),
    hashtags: optionalTextList('Hashtags to search'),
    search: optionalText('Search query'),
    ...baseSchema,
  }),

  // YouTube: keyword search or channel URLs.
  scrape_youtube: z.object({
    search: optionalText('YouTube search query'),
    channel_urls: optionalTextList('YouTube channel URLs to scrape'),
    ...baseSchema,
  }),

  // LinkedIn: profile URLs, company URLs, or a post search.
  scrape_linkedin: z.object({
    profile_urls: optionalTextList('LinkedIn profile URLs'),
    company_urls: optionalTextList('LinkedIn company page URLs'),
    search: optionalText('Search query for posts'),
    ...baseSchema,
  }),

  // Facebook: page URLs or a keyword search.
  scrape_facebook: z.object({
    page_urls: optionalTextList('Facebook page URLs to scrape'),
    search: optionalText('Search query'),
    ...baseSchema,
  }),

  // Reddit: subreddits, a keyword search, or direct thread URLs.
  scrape_reddit: z.object({
    subreddits: optionalTextList('Subreddit names (e.g., "technology") or URLs'),
    search: optionalText('Reddit search query'),
    urls: optionalTextList('Direct Reddit post/thread URLs'),
    ...baseSchema,
  }),
};
// --- Tool Descriptions ---
/**
 * Human-readable description for each scrape tool, keyed by tool name.
 * The keys match those of `scrapeSchemas`.
 */
export const scrapeDescriptions: Record<string, string> = {
  scrape_twitter:
    'Scrape tweets from Twitter/X. Search by query, hashtag, or user timeline. Results are saved to the local timeline database for later analysis.',
  scrape_instagram:
    'Scrape posts from Instagram. Search by URL, hashtag, or keyword. Results are saved to the local timeline database.',
  scrape_tiktok:
    'Scrape videos from TikTok. Search by profile, hashtag, or keyword. Results are saved to the local timeline database.',
  scrape_youtube:
    'Scrape videos from YouTube. Search by keyword or channel URL. Results are saved to the local timeline database.',
  scrape_linkedin:
    'Scrape posts from LinkedIn. Search by profile URL, company URL, or keyword. Results are saved to the local timeline database.',
  scrape_facebook:
    'Scrape posts from Facebook pages. Search by page URL or keyword. Results are saved to the local timeline database.',
  scrape_reddit:
    'Scrape posts from Reddit. Search by subreddit, keyword, or direct URL. Results are saved to the local timeline database.',
};
// --- Tool Handlers ---
/**
 * Maps each scrape tool name to the platform identifier that
 * `getScrapeHandler` hands to `scrape`.
 */
const platformMap: Record<string, Platform> = {
  scrape_twitter: 'twitter',
  scrape_instagram: 'instagram',
  scrape_tiktok: 'tiktok',
  scrape_youtube: 'youtube',
  scrape_linkedin: 'linkedin',
  scrape_facebook: 'facebook',
  scrape_reddit: 'reddit',
};
/**
 * Sums a post's likes, comments and shares into a single ranking score.
 *
 * Counts that are not finite numbers contribute 0. A plain sum would turn
 * one bad count into NaN, and every comparison against NaN is false, which
 * would freeze the ranking on whichever post came first.
 * NOTE(review): whether scraped posts can actually carry non-finite counts
 * depends on the platform modules — this is defensive.
 */
function interactionScore(post: {
  engagement: { likes: number; comments: number; shares: number };
}): number {
  const { likes, comments, shares } = post.engagement;
  return [likes, comments, shares].reduce(
    (sum, count) => sum + (Number.isFinite(count) ? count : 0),
    0,
  );
}

/**
 * Runs a scrape for one platform and summarizes the result.
 *
 * Steps: build the platform-specific input from `params`, scrape, try to
 * upsert the posts into the timeline store (best-effort), then compute batch
 * engagement and pick the post with the highest likes + comments + shares.
 *
 * @param platform - Platform identifier used to look up the platform module.
 * @param params - Raw tool arguments, passed to the module's `buildInput`.
 * @returns The scraped posts, summary stats (`total`, `avgEngagement`,
 *   `topPost` — `null` when there are no posts) and whether the timeline
 *   upsert reported at least one saved post.
 * @throws Anything thrown by `getPlatform`, `buildInput`, the platform's
 *   `scrape` or `calculateBatchEngagement`; only the timeline upsert is
 *   guarded.
 */
async function scrape(platform: Platform, params: Record<string, unknown>): Promise<ScrapeResult> {
  const platformModule = getPlatform(platform);
  const input = platformModule.buildInput(params);
  const posts = await platformModule.scrape(input);

  let savedToTimeline = false;
  try {
    const count = await upsertPosts(posts);
    savedToTimeline = count > 0;
  } catch (err) {
    // Timeline save is best-effort — don't fail the scrape.
    // The error is passed as its own argument so the console formats it
    // (including the stack for Error objects) instead of a template literal
    // collapsing non-Error values to "[object Object]".
    console.error('Failed to save to timeline:', err);
  }

  const engagement = calculateBatchEngagement(posts);

  // Single pass, scoring each post once. Ties keep the earliest post because
  // only a strictly greater score replaces the current best.
  let topPost: (typeof posts)[number] | null = null;
  let topScore = Number.NEGATIVE_INFINITY;
  for (const post of posts) {
    const score = interactionScore(post);
    if (score > topScore) {
      topPost = post;
      topScore = score;
    }
  }

  return {
    posts,
    stats: {
      total: posts.length,
      avgEngagement: engagement.rate,
      topPost,
    },
    savedToTimeline,
  };
}
/**
 * Returns the async handler for a scrape tool.
 *
 * The handler runs `scrape` for the tool's platform and wraps the result as
 * a single text content item containing the result as pretty-printed JSON
 * (2-space indent).
 *
 * @param toolName - A key of `platformMap`, e.g. `"scrape_twitter"`.
 * @returns An async function taking the tool's raw params.
 * @throws Error when `toolName` is not a known scrape tool.
 */
export function getScrapeHandler(toolName: string) {
  // Own-property check: a bare `platformMap[toolName]` also resolves members
  // inherited from Object.prototype, so names like "constructor", "toString"
  // or "__proto__" would yield a truthy non-Platform value and slip past the
  // unknown-tool guard below.
  const platform = Object.prototype.hasOwnProperty.call(platformMap, toolName)
    ? platformMap[toolName]
    : undefined;
  if (!platform) throw new Error(`Unknown scrape tool: ${toolName}`);

  return async (params: Record<string, unknown>) => {
    const result = await scrape(platform, params);
    return {
      content: [
        {
          type: 'text' as const,
          text: JSON.stringify(result, null, 2),
        },
      ],
    };
  };
}