// http-deep-scan.js
import fetch from 'node-fetch';
import * as cheerio from 'cheerio';
import { SCAN_CONFIG } from './constants.js';
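// Note: node-fetch is imported explicitly here; on Node 18+ the built-in global fetch (and
// AbortSignal.timeout, available since Node 17.3) could likely serve the same purpose.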
// HTTP-based deep scan implementation - lighter-weight than Playwright/Puppeteer because pages
// are fetched and parsed directly, with no browser to launch or manage
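/**
 * Deep-scan a single job posting over plain HTTP.
 *
 * @param {object} agent      Provides agent.openai and agent.env (used by analyzeJobPageWithLLM).
 * @param {object} job        Job record from the stored index; must include `url`, and may carry `title`, `company`, `location`, `id`.
 * @param {string} profile    Candidate profile text used for matching.
 * @param {string} scanPrompt Optional additional matching criteria.
 * @returns {Promise<object>} Analysis result containing at least matchScore (0.0-1.0) and matchReason;
 *                            on the successful LLM path also title, company, location, description and salary.
 * @throws If the job has no URL, the fetch fails or times out, or the server returns a non-OK status.
 */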
export async function httpDeepScanSingleJob(agent, job, profile, scanPrompt) {
  if (!job.url) {
    throw new Error('Job URL is required for deep scanning');
  }

  console.log(` → Fetching job URL via HTTP: ${job.url}`);
  const startTime = Date.now();

  try {
    // Fetch the job page with browser-like headers
    const response = await fetch(job.url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
      },
      // Add timeout to prevent hanging
      signal: AbortSignal.timeout(30000) // 30 second timeout
    });

    const fetchTime = Date.now() - startTime;
    console.log(` → HTTP fetch completed in ${fetchTime}ms, status: ${response.status}`);

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    // Parse HTML content
    const html = await response.text();
    const $ = cheerio.load(html);

    // Extract page content similar to how Playwright does it
    const pageContent = {
      title: $('title').text() || '',
      url: job.url,
      fullContent: extractJobContent($)
    };

    console.log(` → Extracted ${pageContent.fullContent.length} characters of content via HTTP`);

    // Validate that we got meaningful job content
    if (pageContent.fullContent.length < 500) {
      console.log(` ⚠️ Warning: Very little content extracted (${pageContent.fullContent.length} chars), may need JavaScript rendering`);
    }

    // Check for job-related content to ensure we got the right page
    const hasJobKeywords = /job|position|role|responsibilities|requirements|qualifications|description/i.test(pageContent.fullContent);
    if (!hasJobKeywords) {
      console.log(` ⚠️ Warning: No job-related keywords found in content, may be blocked or redirected`);
    }

    // Send full page content to LLM for extraction and matching
    const analysisResult = await analyzeJobPageWithLLM(agent, pageContent, job, profile, scanPrompt);
    console.log(` → HTTP-based deep scan completed for ${job.url}, match score: ${analysisResult.matchScore}`);
    return analysisResult;
  } catch (error) {
    const totalTime = Date.now() - startTime;
    console.log(` → HTTP fetch failed after ${totalTime}ms: ${error.message}`);

    // Log additional error details for debugging
    if (error.name === 'AbortError') {
      console.log(` → Request timed out after 30 seconds`);
    } else if (error.message.includes('HTTP')) {
      console.log(` → Server returned error response`);
    } else {
      console.log(` → Network or parsing error`);
    }

    throw error;
  }
}
// Extract meaningful job content from the parsed HTML
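// Note: this only sees the static HTML returned by the server; content rendered client-side by
// JavaScript will be missing, which is what the "< 500 chars" warning above is meant to catch.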
function extractJobContent($) {
  // Remove script and style elements
  $('script, style, noscript').remove();

  // Try to get the main body text, similar to document.body.innerText
  let fullContent = $('body').text();

  // Clean up the text - remove excessive whitespace
  fullContent = fullContent
    .replace(/\s+/g, ' ') // Replace multiple whitespace with single space
    .replace(/\n\s*\n/g, '\n') // Remove empty lines
    .trim();

  return fullContent;
}
// Analyze full job page content with LLM
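// Sends the first 8000 characters of the extracted page text to the model together with the candidate
// profile, expects a JSON-only reply, and falls back to keyword matching if the call or the parse fails.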
export async function analyzeJobPageWithLLM(agent, pageContent, job, profile, scanPrompt) {
  const prompt = `You are a job analysis system. Analyze the LinkedIn job page content and respond with ONLY a JSON object, no other text.

Candidate Profile:
${profile}

Additional Criteria:
${scanPrompt || 'None'}

Job Page Content:
${pageContent.fullContent.substring(0, 8000)}

Extract job details and provide a match score from 0.0 to 1.0.

Respond with ONLY this JSON format (no additional text):
{
  "title": "extracted job title",
  "company": "extracted company name",
  "location": "extracted location",
  "description": "extracted job description",
  "salary": "extracted salary or null",
  "matchScore": 0.8,
  "matchReason": "detailed explanation of match"
}`;

  // Use OpenAI for analysis
  try {
    console.log(` → Analyzing full page with OpenAI...`);
    const response = await agent.openai.chat.completions.create({
      model: agent.env.OPENAI_MODEL || 'gpt-4o',
      messages: [{ role: 'user', content: prompt }],
      temperature: 0.1,
      max_tokens: SCAN_CONFIG.MAX_LLM_TOKENS
    });
    console.log(` → OpenAI analysis complete`);
    try {
      // Clean the response: strip raw control characters that would break JSON.parse.
      // Escaped sequences such as \n inside JSON strings are left untouched so that
      // JSON.parse can decode them itself (converting them early would corrupt the JSON).
      let cleanResponse = response.choices[0].message.content
        .replace(/[\x00-\x1F\x7F-\x9F]/g, '')
        .trim();
      // Try to extract JSON from the response - look for complete JSON objects
      let jsonMatch = cleanResponse.match(/\{[\s\S]*?"matchScore"[\s\S]*?\}/i);
      if (!jsonMatch) {
        // Try broader JSON pattern
        jsonMatch = cleanResponse.match(/\{[\s\S]*\}/i);
      }

      if (jsonMatch) {
        cleanResponse = jsonMatch[0];
      } else {
        // If no JSON found, log the response and try to extract key info
        console.log(` → No JSON found in response: ${cleanResponse.substring(0, 200)}...`);
        throw new Error('No JSON structure found in AI response');
      }

      console.log(` → Extracted JSON: ${cleanResponse.substring(0, 200)}...`);
      const result = JSON.parse(cleanResponse);

      const analysisResult = {
        title: result.title || job.title,
        company: result.company || job.company,
        location: result.location || job.location,
        description: result.description || 'No description extracted',
        salary: result.salary || null,
        matchScore: Math.max(0, Math.min(1, result.matchScore || 0)),
        matchReason: result.matchReason || 'No reason provided'
      };

      console.log(` → AI analysis result prepared: ${analysisResult.title} at ${analysisResult.company}`);
      return analysisResult;
    } catch (parseError) {
      console.error('Error parsing OpenAI analysis:', parseError);
      console.log(` → Falling back to keyword matching due to parse error`);
      return fallbackJobMatching({
        title: job.title,
        company: job.company,
        location: job.location,
        description: pageContent.fullContent.substring(0, 2000)
      }, profile, scanPrompt);
    }
  } catch (aiError) {
    console.log(` → OpenAI analysis failed: ${aiError.message}`);
    console.log(` → Using fallback keyword matching...`);
    return fallbackJobMatching({
      title: job.title,
      company: job.company,
      location: job.location,
      description: pageContent.fullContent.substring(0, 2000)
    }, profile, scanPrompt);
  }
}
// Fallback keyword matching when AI is unavailable
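// Scores a job from 0.0 to 1.0 as matchCount / totalKeywords over the first 20 unique words of 3+
// characters drawn from the profile and scan prompt, plus a 0.1 bonus for each "important" term
// present in both the job text and the profile text, capped at 1.0.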
export function fallbackJobMatching(jobDetails, profile, scanPrompt) {
  try {
    console.log(` → Running fallback keyword matching...`);

    const jobText = `${jobDetails.title} ${jobDetails.company} ${jobDetails.location} ${jobDetails.description}`.toLowerCase();
    const profileText = `${profile} ${scanPrompt || ''}`.toLowerCase();

    // Extract keywords from profile
    const profileKeywords = profileText.match(/\b\w{3,}\b/g) || [];
    const uniqueKeywords = [...new Set(profileKeywords)];

    // Score based on keyword matches
    let matchCount = 0;
    const totalKeywords = Math.min(uniqueKeywords.length, 20); // Limit to top 20 keywords
    const matchedKeywords = [];

    for (const keyword of uniqueKeywords.slice(0, 20)) {
      if (jobText.includes(keyword)) {
        matchCount++;
        matchedKeywords.push(keyword);
      }
    }

    // Bonus scoring for important terms
    const bonusTerms = ['engineer', 'software', 'mechanical', 'new york', 'nyc', 'remote'];
    let bonusScore = 0;
    for (const term of bonusTerms) {
      if (jobText.includes(term) && profileText.includes(term)) {
        bonusScore += 0.1;
      }
    }

    // Calculate final score
    const baseScore = totalKeywords > 0 ? matchCount / totalKeywords : 0;
    const finalScore = Math.min(1.0, baseScore + bonusScore);

    const reason = `Keyword matching: ${matchCount}/${totalKeywords} keywords matched. ` +
      `Matched terms: ${matchedKeywords.slice(0, 5).join(', ')}${matchedKeywords.length > 5 ? '...' : ''}. ` +
      `Bonus score: +${bonusScore.toFixed(1)} for important terms.`;

    console.log(` → Fallback match score: ${finalScore.toFixed(2)}`);

    return {
      matchScore: Math.round(finalScore * 100) / 100, // Round to 2 decimals
      matchReason: reason
    };
  } catch (error) {
    console.error('Error in fallback matching:', error);
    return {
      matchScore: 0.5, // Default neutral score
      matchReason: 'Fallback matching failed, assigned neutral score'
    };
  }
}
// HTTP-based version of performDeepScan that doesn't need browser management
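// Reads the stored `job_index` and `plan` from agent.env.JOB_STORAGE, scans up to
// SCAN_CONFIG.MAX_DEEP_SCAN_JOBS unscanned jobs sequentially (honoring the scan's cancelled flag),
// records per-job results or error details, and then persists the updated index.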
export async function httpPerformDeepScan(agent) {
  try {
    // Get jobs that need deep scanning
    const jobIndex = await agent.env.JOB_STORAGE.get('job_index', 'json');
    if (!jobIndex || !jobIndex.jobs) {
      console.log('No jobs found for deep scanning');
      return;
    }

    const jobsToScan = jobIndex.jobs.filter(job => !job.scanned);
    console.log(`Found ${jobsToScan.length} jobs to deep scan via HTTP`);
    if (jobsToScan.length === 0) {
      return;
    }

    // Get plan for profile and scan prompt
    const plan = await agent.env.JOB_STORAGE.get('plan', 'json');
    if (!plan || !plan.profile) {
      console.log('No profile found in plan for deep scanning');
      return;
    }

    // Limit deep scan to avoid timeouts (configurable limit)
    const limitedJobs = jobsToScan.slice(0, SCAN_CONFIG.MAX_DEEP_SCAN_JOBS);
    console.log(`HTTP deep scanning ${limitedJobs.length} jobs (no browser needed)...`);

    // Initialize progress tracking
    agent.backgroundJobs.scan.deepScanProgress = {
      total: limitedJobs.length,
      completed: 0,
      current: null,
      errors: 0
    };

    // Process jobs sequentially using HTTP requests
    for (let i = 0; i < limitedJobs.length; i++) {
      const job = limitedJobs[i];

      // Check for cancellation before each job
      if (agent.backgroundJobs.scan.cancelled) {
        console.log('Deep scan cancelled by user');
        agent.backgroundJobs.scan.status = 'cancelled';
        break;
      }

      // Update progress tracking
      agent.backgroundJobs.scan.deepScanProgress.current = {
        index: i + 1,
        title: job.title,
        company: job.company,
        url: job.url
      };

      try {
        console.log(`HTTP deep scanning job ${i + 1}/${limitedJobs.length}: ${job.title} at ${job.company}`);
        const scanResult = await httpDeepScanSingleJob(agent, job, plan.profile, plan.scanPrompt || '');

        // Update job with scan results
        job.scanned = true;
        job.scanDate = new Date().toISOString();
        job.matchScore = scanResult.matchScore || 0;
        job.matchReason = scanResult.matchReason || '';
        job.description = scanResult.description || job.description;
        job.requirements = scanResult.requirements || [];
        job.salary = scanResult.salary || null;
        job.scanStatus = 'completed';

        agent.backgroundJobs.scan.deepScanProgress.completed++;
        console.log(`✓ HTTP job scan complete. Match score: ${job.matchScore}`);
      } catch (jobError) {
        console.error(`✗ Error in HTTP scan for job ${job.id} (${job.title}):`, jobError.message);

        // Check if error was due to cancellation
        if (jobError.message.includes('cancelled')) {
          console.log('HTTP deep scan cancelled during job processing');
          agent.backgroundJobs.scan.status = 'cancelled';
          break;
        }

        // Mark job as scanned but with error details
        job.scanned = true;
        job.scanDate = new Date().toISOString();
        job.matchScore = 0;
        job.matchReason = 'HTTP scan failed due to error';
        job.scanStatus = 'error';
        job.scanError = {
          type: jobError.name || 'Error',
          message: jobError.message,
          timestamp: new Date().toISOString()
        };

        // Log specific error types for debugging
        if (jobError.name === 'AbortError') {
          console.log(` → HTTP request timeout (likely slow response)`);
          job.scanError.reason = 'http_timeout';
        } else if (jobError.message.includes('HTTP')) {
          console.log(` → HTTP error response from server`);
          job.scanError.reason = 'http_error';
        } else {
          job.scanError.reason = 'unknown';
        }

        agent.backgroundJobs.scan.deepScanProgress.errors++;
        console.log(` → Continuing with next job...`);
      }
    }

    // Save updated job index with scan statistics
    const completedJobs = limitedJobs.filter(j => j.scanStatus === 'completed').length;
    const errorJobs = limitedJobs.filter(j => j.scanStatus === 'error').length;
    await agent.env.JOB_STORAGE.put('job_index', JSON.stringify(jobIndex));

    console.log(`HTTP deep scan phase completed:`);
    console.log(` ✓ Successfully scanned: ${completedJobs} jobs`);
    console.log(` ✗ Failed to scan: ${errorJobs} jobs`);
    console.log(` 📊 Scans ran over plain HTTP - no browser startup or page-render overhead`);
  } catch (error) {
    console.error('HTTP deep scan failed:', error.message);
    throw error;
  }
}
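
/*
 * Usage sketch (assumptions labeled): this module expects an `agent` object that is wired up
 * elsewhere in the codebase. Based on the calls above it needs at least:
 *   - agent.openai               an OpenAI client exposing chat.completions.create()
 *   - agent.env.JOB_STORAGE      a KV-style store with get(key, 'json') and put(key, value)
 *   - agent.env.OPENAI_MODEL     optional model override (defaults to 'gpt-4o')
 *   - agent.backgroundJobs.scan  mutable scan state ({ cancelled, status, deepScanProgress })
 * A hypothetical caller would then simply run:
 *
 *   await httpPerformDeepScan(agent);
 */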