// scan-helpers.js
import { launch } from "@cloudflare/playwright";
import { autoSendDigest, sendScanFailureNotification } from './digest.js';
import { httpPerformDeepScan } from './http-deep-scan.js';
import { SCAN_CONFIG } from './constants.js';
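// Assumed env bindings (all referenced below): JOB_STORAGE (KV namespace),
// BROWSER (Browser Rendering binding), LINKEDIN_EMAIL and LINKEDIN_PASSWORD (secrets).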
// Generate a unique job ID from URL
export function generateJobId(jobUrl) {
if (!jobUrl) return null;
// Extract job ID from LinkedIn URL (e.g., /view/123456/)
const match = jobUrl.match(/\/view\/(\d+)\//);
if (match) return match[1];
// Fallback: last non-empty path segment (the filter guards against trailing slashes), minus any query string.
const lastSegment = jobUrl.split('/').filter(Boolean).pop();
return lastSegment ? lastSegment.split('?')[0] : null;
}
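// Example (illustrative URL): generateJobId('https://www.linkedin.com/jobs/view/123456/?refId=x')
// -> '123456'. URLs without a /view/<id>/ segment fall back to the last non-empty
// path segment with any query string stripped.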
// Store jobs for later deep scanning
export async function storeJobsForDeepScan(env, jobs) {
try {
// Get existing job index
const existingJobs = await env.JOB_STORAGE.get('job_index', 'json') || { jobs: [] };
// Add new jobs, avoiding duplicates
const existingIds = new Set(existingJobs.jobs.map(j => j.id));
const newJobs = jobs.filter(job => job.id && !existingIds.has(job.id));
if (newJobs.length > 0) {
existingJobs.jobs.push(...newJobs);
existingJobs.lastUpdate = new Date().toISOString();
await env.JOB_STORAGE.put('job_index', JSON.stringify(existingJobs));
console.log(`Stored ${newJobs.length} new jobs for deep scanning`);
}
} catch (error) {
console.error('Error storing jobs for deep scan:', error);
}
}
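// Shape of the 'job_index' KV value: { jobs: [...], lastUpdate: <ISO timestamp> }.
// Entries are only ever appended here; nothing in this module prunes the index.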
// Main scan function
export async function runScan(agent, url, options = {}) {
const { sendDigest = true } = options;
let browser = null;
try {
let urlsToProcess = [];
console.log('--- runScan invoked ---');
if (url) {
urlsToProcess.push({ url }); // Match the structure of plan.searchUrls
} else {
const plan = await agent.env.JOB_STORAGE.get('plan', 'json');
if (!plan || !plan.searchUrls || plan.searchUrls.length === 0) {
throw new Error('No URL provided and no searches found in the current plan.');
}
urlsToProcess = plan.searchUrls;
console.log(`Loaded plan with ${plan.searchUrls.length} search URLs`);
console.log('URLs to scan:', urlsToProcess.map(u => u.url));
}
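// From here on, each urlsToProcess entry is an object of the form { url: string },
// whether it came from the explicit `url` argument or from plan.searchUrls.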
agent.backgroundJobs.scan.status = 'running';
agent.backgroundJobs.scan.urlsToScan = urlsToProcess.map(u => u.url);
// Create browser instance for authenticated LinkedIn search (still needed for search results)
// Note: Deep scan phase now uses HTTP requests instead of browser
console.log('Launching browser for authenticated LinkedIn search phase...');
browser = await launch(agent.env.BROWSER);
const page = await browser.newPage();
// Block unnecessary resources to improve performance and reduce bandwidth
await page.route('**/*', (route) => {
const resourceType = route.request().resourceType();
// Named requestUrl to avoid shadowing runScan's `url` parameter.
const requestUrl = route.request().url();
// Block images, stylesheets, fonts, and media files
if (['image', 'stylesheet', 'font', 'media'].includes(resourceType)) {
route.abort();
}
// Block common tracking and analytics scripts
else if (requestUrl.includes('google-analytics') || requestUrl.includes('googletagmanager') ||
requestUrl.includes('facebook.com') || requestUrl.includes('doubleclick') ||
requestUrl.includes('ads') || requestUrl.includes('analytics')) {
route.abort();
}
else {
route.continue();
}
});
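// Note: the bare 'ads' and 'analytics' substring checks are broad and can match
// unrelated URLs (e.g., anything containing 'uploads'), so a few legitimate
// requests may be blocked.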
console.log('Resource blocking configured for improved performance');
// Log in once at the beginning of the scan.
console.log('Navigating to LinkedIn login page...');
await page.goto('https://www.linkedin.com/login', { waitUntil: 'domcontentloaded' });
console.log('Entering login credentials...');
await page.fill('#username', agent.env.LINKEDIN_EMAIL);
await page.fill('#password', agent.env.LINKEDIN_PASSWORD);
console.log('Submitting login form and waiting for login to complete...');
// Click and await the resulting navigation together, so the post-login navigation is not missed.
await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle' }).catch(() => console.log('Navigation timeout after login, continuing...')),
page.click('button[type="submit"]'),
]);
// Check for security verification right after login attempt
const postLoginUrl = page.url();
if (postLoginUrl.includes('checkpoint') || postLoginUrl.includes('security-verification')) {
console.log(`LinkedIn security check detected at ${postLoginUrl}. Aborting scan.`);
agent.backgroundJobs.scan.error = 'LinkedIn security check detected. Manual login in a browser may be required.';
// Continuing past a checkpoint is pointless, so abort the scan here.
throw new Error(agent.backgroundJobs.scan.error);
}
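// A checkpoint typically requires interactive verification (CAPTCHA, device
// confirmation, etc.) that a headless session cannot complete, hence the hard abort.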
for (const scanUrl of urlsToProcess) {
console.log(`Navigating to job search URL: ${scanUrl.url}`);
await page.goto(scanUrl.url, { waitUntil: 'domcontentloaded' });
const pageTitle = await page.title();
const pageUrl = page.url();
console.log(`Landed on page: "${pageTitle}" at URL: ${pageUrl}`);
try {
// 1. Wait for the header to ensure the page is ready.
await page.waitForSelector('.jobs-search-results-list__header', { timeout: SCAN_CONFIG.PAGE_TIMEOUT });
// 2. Use the user-provided selector for job cards.
const jobSelector = '.job-card-list';
const jobs = await page.$$eval(jobSelector, (els) => {
// 3. Use the new data extraction logic based on the user's HTML.
return els.map(el => {
const titleEl = el.querySelector('a.job-card-list__title--link');
const companyEl = el.querySelector('.artdeco-entity-lockup__subtitle span');
// The location is in the first list item of the metadata.
const locationEl = el.querySelector('.job-card-container__metadata-wrapper li');
return {
title: titleEl?.innerText.trim() || null,
company: companyEl?.innerText.trim() || null,
// \s+ already matches newlines, so one collapse-and-trim pass suffices.
location: locationEl?.innerText.replace(/\s+/g, ' ').trim() || null,
url: titleEl?.href ? titleEl.href.split('?')[0] : null,
};
});
});
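// Each extracted record looks like (illustrative values):
// { title: 'Senior Engineer', company: 'Acme Corp', location: 'Remote',
//   url: 'https://www.linkedin.com/jobs/view/123456' }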
console.log(`Found ${jobs.length} jobs on this page.`);
agent.backgroundJobs.scan.totalJobsFound += jobs.length;
// Store jobs for deep scanning
if (jobs.length > 0) {
const jobsWithId = jobs.map(job => ({
...job,
id: generateJobId(job.url),
searchUrl: scanUrl.url,
scanned: false,
scanDate: null,
matchScore: null
}));
// Store jobs in KV for later deep scan
await storeJobsForDeepScan(agent.env, jobsWithId);
}
} catch (selectorError) {
console.log(`Could not find job list using the new selectors: ${selectorError.message}`);
agent.backgroundJobs.scan.error = `Failed to find job list on page. The layout may have changed.`;
}
agent.backgroundJobs.scan.scannedUrls.push(scanUrl.url);
console.log('Finished scraping this search URL; moving to the next one...');
}
// After all search URLs are processed, start deep scan phase
console.log('Starting HTTP-based deep scan phase (no browser needed)...');
agent.backgroundJobs.scan.status = 'deep_scanning';
// Use HTTP-based deep scan instead of browser-based approach
await httpPerformDeepScan(agent);
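// httpPerformDeepScan (imported from http-deep-scan.js) presumably consumes the
// 'job_index' written above and fills in scanned/scanDate/matchScore per job.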
// Set status to completed before closing the browser to ensure state is updated.
agent.backgroundJobs.scan.status = 'completed';
agent.backgroundJobs.scan.endTime = new Date().toISOString();
agent.backgroundJobs.scan.inProgress = false;
console.log('Scan completed successfully');
// Auto-send digest if requested
if (sendDigest) {
const digestResult = await autoSendDigest(agent.env, { source: 'scan' });
if (digestResult.success) {
console.log(`Auto-digest sent successfully: ${digestResult.jobsSent} jobs`);
} else {
console.log(`Auto-digest failed: ${digestResult.error}`);
}
}
} catch (error) {
console.error('Error in runScan:', error);
agent.backgroundJobs.scan.error = error.message;
agent.backgroundJobs.scan.status = 'failed';
agent.backgroundJobs.scan.inProgress = false;
agent.backgroundJobs.scan.endTime = new Date().toISOString();
// Send failure notification email unless sendDigest is false
if (sendDigest) {
try {
const failureResult = await sendScanFailureNotification(agent.env, error.message);
if (failureResult.success) {
console.log('Failure notification sent successfully');
} else {
console.log(`Failed to send failure notification: ${failureResult.error}`);
}
} catch (digestError) {
console.error('Error sending failure notification:', digestError);
}
}
} finally {
// Ensure browser is always closed, even on error
if (browser) {
try {
console.log('Closing browser...');
await browser.close();
console.log('Browser closed successfully');
} catch (closeError) {
console.error('Error closing browser:', closeError.message);
}
}
}
}
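// Usage sketch (hypothetical caller; assumes `agent` exposes the env bindings and a
// pre-initialized backgroundJobs.scan state object, as this module expects):
//
//   // Scan one explicit search URL without emailing a digest:
//   await runScan(agent, 'https://www.linkedin.com/jobs/search/?keywords=platform+engineer', { sendDigest: false });
//
//   // Scan every URL in the stored plan and auto-send the digest on success:
//   await runScan(agent);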