Skip to main content
Glama
amineremache

Daft.ie MCP Server

by amineremache
parser.service.ts (14.3 kB)
import * as cheerio from 'cheerio';
import { config } from '../config.js';
import { parsePrice, parseBeds, slugify, extractLatLng, formatBer } from '../utils/parser.utils.js';
import type { Property, Unit } from '../types.js';
import logger from '../logger.js';

/** Number of listings assumed per search results page when converting a result count to a page count. */
const RESULTS_PER_PAGE = 20;

/**
 * Container whose inner HTML is handed to `extractLatLng`.
 * NOTE(review): looks like a build-generated class name — presumably brittle; confirm against live markup.
 */
const LAT_LNG_CONTAINER_SELECTOR = 'div.sc-eb305aa9-35.jfAOAq';

/** Selector passed to `extractLatLng`. The spelling "satelite" is kept on purpose: it is the value the code matches against. */
const SATELLITE_LINK_SELECTOR = 'a[data-testid="satelite-button"]';

/** Price selectors for a search result card, tried in order until one yields non-empty text. */
const PRICE_SELECTORS = [
  'p[data-testid="price"]',
  '[class*="TitleBlock__StyledSpan"][class*="price"]',
  'p.csEcJw',
  'div[data-testid="card-price"] p',
] as const;

/** Exact (case-insensitive) texts accepted as a property type when no dedicated element is present. */
const PROPERTY_TYPE_KEYWORDS: ReadonlySet<string> = new Set(['apartment', 'house', 'studio', 'flat']);

/** Fields a sub-unit must have before it is emitted as a standalone property. */
const REQUIRED_SUB_UNIT_FIELDS = ['id', 'url', 'address', 'priceString'] as const;

/**
 * Extracts a listing id from an href: the last non-empty path segment,
 * ignoring any query string or fragment.
 *
 * @param href The (usually relative) link taken from an anchor's `href` attribute.
 * @returns The id segment, or an empty string when the path has no non-empty segment.
 */
function extractListingId(href: string): string {
  const path = href.split(/[?#]/, 1)[0] ?? '';
  const segments = path.split('/').filter((segment) => segment.length > 0);
  return segments[segments.length - 1] ?? '';
}

/**
 * Runs `extractLatLng` over a container's inner HTML and, when it yields a result,
 * copies the coordinates onto `target`. `target` is left untouched otherwise.
 *
 * @param target The property object to update in place.
 * @param containerHtml Inner HTML of the lat/lng container (`null` is treated as empty).
 */
function applyLatLng(target: Partial<Property>, containerHtml: string | null): void {
  const latLng = extractLatLng(containerHtml ?? '', SATELLITE_LINK_SELECTOR);
  if (latLng) {
    target.latitude = latLng.lat;
    target.longitude = latLng.lng;
  }
}

/** Heuristic used by the last-resort price lookup: does this text look like a price? */
function looksLikePrice(text: string): boolean {
  return text.includes('€') || /per (month|week)/i.test(text) || /POA|price on application/i.test(text);
}

/** Heuristic used by the bedroom fallback: text starting with "<n> Bed" or mentioning "studio". */
function looksLikeBeds(text: string): boolean {
  return /^\d+\s*Bed/i.test(text) || text.toLowerCase().includes('studio');
}

/**
 * Parses the HTML of a search results page to extract property listings.
 *
 * Each `li[data-testid^="result-"]` under `ul[data-testid="results"]` becomes one entry.
 * Every field is best-effort: dedicated `data-testid` elements are tried first, then
 * progressively looser fallbacks. Cards that contain unit links get a `units` array,
 * and the first complete unit backfills price / beds / type missing on the card itself.
 *
 * @param html The HTML content of the search results page.
 * @returns An array of (possibly partial) property objects, in document order.
 */
export function parseSearchResults(html: string): Partial<Property>[] {
  const $ = cheerio.load(html);
  const properties: Partial<Property>[] = [];

  $('ul[data-testid="results"] li[data-testid^="result-"]').each((_index, element) => {
    const $element = $(element);
    const property: Partial<Property> = {};

    // Trimmed text of the first p/span/li in this card satisfying `predicate`, or '' if none does.
    const firstTextMatching = (predicate: (text: string) => boolean): string => {
      for (const node of $element.find('p, span, li').toArray()) {
        const text = $(node).text().trim();
        if (predicate(text)) return text;
      }
      return '';
    };

    property.address = $element.find('div[data-tracking="srp_address"] p').first().text().trim();

    // --- URL and id -------------------------------------------------------
    let mainLink = $element.find('a[data-testid="card-link"]').first();
    if (mainLink.length === 0) {
      mainLink = $element.find('a.sc-e4e4a161-16.dukjos').first();
    }
    const relativeUrl = mainLink.attr('href');
    if (relativeUrl) {
      property.url = `${config.daft.baseUrl}${relativeUrl}`;
      property.id = extractListingId(relativeUrl);
    }
    if (!property.id) {
      // No usable link: fall back to the id embedded in the list item's data-testid.
      const dataTestId = $element.attr('data-testid');
      if (dataTestId) property.id = dataTestId.replace('result-', '');
    }
    if (!property.url && property.id && property.address) {
      // No link at all: synthesize a URL from the address slug and the id.
      const addressSlug = slugify(property.address);
      property.url = `${config.daft.baseUrl}/for-rent/${addressSlug}/${property.id}`;
    }

    property.tagline = $element.find('div[data-tracking="srp_tagline"] p').first().text().trim();

    // --- Price ------------------------------------------------------------
    let mainPriceString = '';
    for (const selector of PRICE_SELECTORS) {
      mainPriceString = $element.find(selector).first().text().trim();
      if (mainPriceString) break;
    }
    if (!mainPriceString) {
      // Last resort: a generic paragraph, accepted only if its text looks like a price.
      const candidate = $element.find('p[class^="sc-4c172e97-0"]').first().text().trim();
      if (looksLikePrice(candidate)) {
        mainPriceString = candidate;
      }
    }
    property.priceString = mainPriceString;
    const parsedPriceResult = parsePrice(mainPriceString);
    property.parsedPrice = parsedPriceResult.value;
    property.priceType = parsedPriceResult.type;

    // --- Beds / baths / type ---------------------------------------------
    const mainBedsString =
      $element.find('[data-testid="beds"]').text().trim() || firstTextMatching(looksLikeBeds);
    const mainBathsString = $element.find('[data-testid="baths"]').text().trim();
    property.bedsString = mainBedsString;
    property.parsedBeds = parseBeds(mainBedsString);
    property.bathsString = mainBathsString;

    property.propertyTypeString =
      $element.find('[data-testid="property-type"]').text().trim() ||
      firstTextMatching((text) => PROPERTY_TYPE_KEYWORDS.has(text.toLowerCase()));

    // --- BER: first source that yields a non-empty value ------------------
    const ber =
      $element.find('div[data-testid="callout-container"] [aria-label^="BER"]').attr('aria-label') ||
      $element.find('div[data-tracking="srp_ber"]').attr('aria-label') ||
      $element.find('p[data-testid="ber"]').text().trim();
    if (ber) {
      property.ber = formatBer(ber);
    }

    // --- Coordinates ------------------------------------------------------
    const latLngElement = $element.find(LAT_LNG_CONTAINER_SELECTOR);
    if (latLngElement.length > 0) {
      applyLatLng(property, latLngElement.html());
    }

    // --- Units listed inside the card --------------------------------------
    const unitElements = $element
      .children('div[data-testid="card-container"]')
      .find('a[href*="/for-rent/"]');
    if (unitElements.length > 0) {
      const units: Unit[] = [];
      property.units = units;

      unitElements.each((_unitIndex, unitElement) => {
        const $unit = $(unitElement);
        const unitRelativeUrl = $unit.attr('href');
        if (!unitRelativeUrl) return;

        const unitDetails: Partial<Unit> = {
          url: `${config.daft.baseUrl}${unitRelativeUrl}`,
          id: extractListingId(unitRelativeUrl),
        };

        const unitPriceString = $unit.find('p[class*="jmFLnF"]').first().text().trim();
        unitDetails.priceString = unitPriceString;
        const unitParsedPriceResult = parsePrice(unitPriceString);
        unitDetails.parsedPrice = unitParsedPriceResult.value;
        unitDetails.priceType = unitParsedPriceResult.type;

        let bedsString: string | undefined;
        let bathsString: string | undefined;
        let propertyTypeString: string | undefined;
        const detailsContainer = $unit.find('div[class*="eKLMRy"]');
        if (detailsContainer.length > 0) {
          // Preferred layout: spans tagged with mc-details-* test ids.
          const spans = detailsContainer.find('span');
          const detailText = (testId: string): string =>
            spans
              .filter((_i, el) => $(el).attr('data-testid') === testId)
              .text()
              .trim();
          bedsString = detailText('mc-details-first-item');
          bathsString = detailText('mc-details-second-item');
          propertyTypeString = detailText('mc-details-third-item');
        } else {
          // Fallback layout: positional spans (beds, baths, type).
          const infoSpans = $unit.find('div[class*="kzXTWf"] span');
          if (infoSpans.length > 0) bedsString = infoSpans.eq(0).text().trim();
          if (infoSpans.length > 1) bathsString = infoSpans.eq(1).text().trim();
          if (infoSpans.length > 2) propertyTypeString = infoSpans.eq(2).text().trim();
        }
        unitDetails.bedsString = bedsString;
        unitDetails.parsedBeds = parseBeds(bedsString);
        unitDetails.bathsString = bathsString;
        unitDetails.propertyTypeString = propertyTypeString;

        // Only fully populated units are kept; incomplete ones are dropped.
        if (
          unitDetails.id &&
          unitDetails.url &&
          unitPriceString &&
          bedsString &&
          bathsString &&
          propertyTypeString
        ) {
          units.push(unitDetails as Unit);
        }
      });

      // Backfill card-level fields from the first complete unit.
      const [firstUnit] = units;
      if (firstUnit) {
        if (property.priceType === 'unknown' || !property.parsedPrice) {
          property.parsedPrice = firstUnit.parsedPrice;
          property.priceString = firstUnit.priceString;
          property.priceType = firstUnit.priceType;
        }
        if (!property.parsedBeds) {
          property.parsedBeds = firstUnit.parsedBeds;
          property.bedsString = firstUnit.bedsString;
        }
        if (!property.propertyTypeString) {
          property.propertyTypeString = firstUnit.propertyTypeString;
        }
      }
    }

    properties.push(property);
  });

  return properties;
}

/**
 * Parses the HTML of a property details page to extract additional information.
 *
 * The passed-in `property` is mutated and returned. Two page shapes are handled:
 * - Development page (has `a[data-testid="sub-unit"]` links): each sub-unit is parsed
 *   into its own property, inheriting address / tagline / coordinates from the parent,
 *   and the list is stored in `property.units` for the caller to flatten.
 * - Regular page: only a missing BER rating is filled in.
 *
 * @param html The HTML content of the property details page.
 * @param property The property object to enhance (modified in place).
 * @returns The same, enhanced property object.
 */
export function parsePropertyDetails(html: string, property: Partial<Property>): Partial<Property> {
  const $ = cheerio.load(html);

  // First, get the definitive lat/lng for this page
  const latLngElement = $(LAT_LNG_CONTAINER_SELECTOR);
  if (latLngElement.length > 0) {
    applyLatLng(property, latLngElement.html());
  }

  const subUnitElements = $('a[data-testid="sub-unit"]');
  logger.info(`[parser.service.ts] Found ${subUnitElements.length} sub-units for ${property.url}`);

  if (subUnitElements.length === 0) {
    logger.info(`[parser.service.ts] No sub-units found for ${property.url}. Treating as single property.`);
    // This is a regular page, enhance the original property with any missing details
    if (!property.ber) {
      const detailBer =
        $('div[data-testid="callout-container"] [aria-label^="BER"]').attr('aria-label') ||
        $('p[data-testid="ber"]').text().trim() ||
        $('div[data-testid="ber-container"] p').text().trim();
      if (detailBer) {
        property.ber = formatBer(detailBer);
      }
    }
    return property;
  }

  // This is a development page. We will parse its sub-units and return them as individual properties,
  // discarding the parent "development" property which is not a rentable unit itself.
  const subProperties: Property[] = [];

  subUnitElements.each((_index, element) => {
    const $unit = $(element);
    const relativeUrl = $unit.attr('href');
    if (!relativeUrl) return; // Skip if no URL

    const unitDetails: Partial<Property> = {
      // Inherit from parent property
      address: property.address,
      tagline: property.tagline,
      latitude: property.latitude,
      longitude: property.longitude,
    };
    unitDetails.url = `${config.daft.baseUrl}${relativeUrl}`;
    unitDetails.id = extractListingId(relativeUrl);

    const priceString = $unit.find('p[data-testid="mc-title"]').text().trim();
    unitDetails.priceString = priceString;
    const parsedPrice = parsePrice(priceString);
    unitDetails.parsedPrice = parsedPrice.value;
    unitDetails.priceType = parsedPrice.type;

    const spans = $unit.find('div[class*="eKLMRy"]').find('span');
    const detailText = (testId: string): string =>
      spans
        .filter((_i, el) => $(el).attr('data-testid') === testId)
        .text()
        .trim();
    unitDetails.bedsString = detailText('mc-details-first-item');
    unitDetails.bathsString = detailText('mc-details-second-item');
    unitDetails.propertyTypeString = detailText('mc-details-third-item');

    unitDetails.parsedBeds = parseBeds(unitDetails.bedsString);
    if (unitDetails.parsedBeds?.isStudio && !unitDetails.propertyTypeString) {
      unitDetails.propertyTypeString = 'Studio';
    }

    const berAriaLabel = $unit.find('div[data-testid="mc-ber"] div').attr('aria-label');
    if (berAriaLabel) {
      unitDetails.ber = formatBer(berAriaLabel);
    }

    // Ensure we have a valid property before adding it
    const missingData = REQUIRED_SUB_UNIT_FIELDS.filter((field) => !unitDetails[field]);
    if (missingData.length === 0) {
      subProperties.push(unitDetails as Property);
    } else {
      logger.warn(
        `[parser.service.ts] Skipping sub-unit due to missing essential data: [${missingData.join(
          ', '
        )}]. URL: ${unitDetails.url}`
      );
    }
  });

  logger.info(
    `[parser.service.ts] Replaced development page ${property.url} with ${subProperties.length} sub-properties.`
  );
  // This is a bit of a hack, but we're returning the sub-properties in the 'units' field
  // so the main scraper service can flatten them.
  property.units = subProperties as Unit[];

  return property;
}

/**
 * Determines the total number of pages from the search results HTML.
 *
 * Reads the "... of N total results" text, divides N by the page size (rounding up)
 * and caps the result at `config.scraping.maxPagesToFetch`. Note that a parsed count
 * of 0 yields 0 pages; 1 is returned only when the count cannot be found or parsed.
 *
 * @param html The HTML content of the search results page.
 * @returns The total number of pages to fetch.
 */
export function getTotalPages(html: string): number {
  const $ = cheerio.load(html);
  const totalResultsString = $('span.sc-4c172e97-0.ioLdWh').first().text();
  logger.info(`[parser.service.ts] Found total results string: "${totalResultsString}"`);

  const rawCount = totalResultsString.match(/of\s*([\d,]+)\s*total results/i)?.[1];
  if (!rawCount) {
    return 1;
  }

  // Strip thousands separators before parsing, e.g. "1,234" -> 1234.
  const totalResults = parseInt(rawCount.replace(/,/g, ''), 10);
  if (Number.isNaN(totalResults)) {
    return 1;
  }

  const calculatedPages = Math.ceil(totalResults / RESULTS_PER_PAGE);
  const totalPages = Math.min(calculatedPages, config.scraping.maxPagesToFetch);
  if (calculatedPages > config.scraping.maxPagesToFetch) {
    logger.warn(
      `[parser.service.ts] Calculated total pages (${calculatedPages}) exceeds config.scraping.maxPagesToFetch (${config.scraping.maxPagesToFetch}). Capping at ${totalPages}.`
    );
  }
  logger.info(
    `[parser.service.ts] Extracted total results: ${totalResults}, calculated pages: ${calculatedPages}, effective total pages: ${totalPages}`
  );
  return totalPages;
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/amineremache/daft-ie-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.