extract_scholar
Extract research results from Google Scholar search URLs to get titles, authors, publication years, and snippets with timestamps for data freshness awareness.
Instructions
Extract research results from a Google Scholar search URL. Returns titles, authors, publication years, and snippets — all timestamped.
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | Google Scholar search URL e.g. https://scholar.google.com/scholar?q=... | |
| max_length | No | | 6000 |
Implementation Reference
- src/adapters/scholar.ts:5-69 (handler) The core logic for extracting research results from Google Scholar using Playwright.
/**
 * Loads a Google Scholar search results page in headless Chromium and
 * formats the result entries as plain text.
 *
 * @param options - Extraction options; only `options.url` is read here. The
 *   URL is passed through `validateUrl(url, "scholar")` (defined elsewhere)
 *   before any navigation happens.
 * @returns An object with:
 *   - `raw`: one numbered entry per result (title, authors, year, snippet,
 *     link), entries separated by a blank line, or a fixed "no results"
 *     message when nothing matched the result selector;
 *   - `content_date`: `<newest year>-01-01` derived from the newest year
 *     found in the author lines, or `null` when no year was found;
 *   - `freshness_confidence`: `"high"` when a year was found, else `"low"`.
 * @throws Whatever `validateUrl`, the browser launch, navigation (20 s
 *   timeout), or in-page evaluation throws. The browser is closed in all of
 *   those cases once it has been launched.
 */
export async function scholarAdapter(options: ExtractOptions): Promise<AdapterResult> {
  type ScholarHit = {
    title: string | null;
    authors: string | null;
    snippet: string | null;
    link: string | null;
    year: string | null;
  };

  const safeUrl = validateUrl(options.url, "scholar");

  // Script evaluated inside the page. It is passed as a string, so it must stay
  // plain browser-side JavaScript; the doubled backslashes become single ones
  // once the template literal is evaluated, yielding /\b(19|20)\d{2}\b/ in the page.
  const extractScript = `(function() { var items = Array.from(document.querySelectorAll('.gs_r.gs_or.gs_scl')); var results = items.map(function(el) { var titleEl = el.querySelector('.gs_rt'); var title = titleEl ? titleEl.textContent.trim() : null; var authorsEl = el.querySelector('.gs_a'); var authors = authorsEl ? authorsEl.textContent.trim() : null; var snippetEl = el.querySelector('.gs_rs'); var snippet = snippetEl ? snippetEl.textContent.trim() : null; var linkEl = el.querySelector('.gs_rt a'); var link = linkEl ? linkEl.getAttribute('href') : null; var yearMatch = authors ? authors.match(/\\b(19|20)\\d{2}\\b/) : null; var year = yearMatch ? yearMatch[0] : null; return { title: title, authors: authors, snippet: snippet, link: link, year: year }; }); return results; })()`;

  const browser = await chromium.launch({ headless: true });
  let data: unknown;
  try {
    const page = await browser.newPage();
    await page.setExtraHTTPHeaders({
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    });
    await page.goto(safeUrl, { waitUntil: "domcontentloaded", timeout: 20000 });
    data = await page.evaluate(extractScript);
  } finally {
    // Always release the Chromium process, including when navigation times
    // out or the in-page script throws.
    await browser.close();
  }

  // The shape is dictated by the object literal returned from extractScript.
  const typedData = data as ScholarHit[];

  if (!typedData.length) {
    return {
      raw: "No results found on this Scholar page.",
      content_date: null,
      freshness_confidence: "low",
    };
  }

  const raw = typedData
    .map((r, i) =>
      [
        `[${i + 1}] ${r.title ?? "Untitled"}`,
        `Authors: ${r.authors ?? "Unknown"}`,
        `Year: ${r.year ?? "Unknown"}`,
        `Snippet: ${r.snippet ?? "N/A"}`,
        `Link: ${r.link ?? "N/A"}`,
      ].join("\n")
    )
    .join("\n\n");

  // Years are 4-digit strings, so lexicographic order equals numeric order.
  // `filter` returns a fresh array, so sorting in place does not touch typedData.
  const years = typedData
    .map((r) => r.year)
    .filter((y): y is string => Boolean(y));
  const newestYear = years.sort().reverse()[0] ?? null;

  return {
    raw,
    // Only the year is known, so the date is pinned to January 1st of that year.
    content_date: newestYear ? `${newestYear}-01-01` : null,
    freshness_confidence: newestYear ? "high" : "low",
  };
}
- src/server.ts:52-70 (registration)Registration of the 'extract_scholar' tool and its associated schema and handler wrapper.
server.registerTool( "extract_scholar", { description: "Extract research results from a Google Scholar search URL. Returns titles, authors, publication years, and snippets — all timestamped.", inputSchema: z.object({ url: z.string().url().describe("Google Scholar search URL e.g. https://scholar.google.com/scholar?q=..."), max_length: z.number().optional().default(6000), }), annotations: { readOnlyHint: true, openWorldHint: true }, }, async ({ url, max_length }) => { try { const result = await scholarAdapter({ url, maxLength: max_length }); const ctx = stampFreshness(result, { url, maxLength: max_length }, "google_scholar"); return { content: [{ type: "text", text: formatForLLM(ctx) }] }; } catch (err) { return { content: [{ type: "text", text: formatSecurityError(err) }] }; }