import { JSDOM } from "jsdom";
import is_ip_private from "private-ip";
/**
 * One `<url>` entry read from an XML sitemap by `SuperstoreSitemapParser.parseSitemap`.
 * Optional fields are left undefined when the matching child element is missing or empty.
 */
export interface SitemapUrl {
/** Text content of the entry's `<loc>` element; entries without it are not emitted. */
loc: string;
/** Text content of the `<lastmod>` element, kept as the raw string (not parsed into a date). */
lastmod?: string;
/** Text content of the `<changefreq>` element. */
changefreq?: string;
/** Text content of the `<priority>` element, kept as a string (not converted to a number). */
priority?: string;
}
/**
 * A store category discovered from a sitemap URL of the form `/en/food/c/{id}`.
 */
export interface CategoryInfo {
/** Run of digits captured after `/en/food/c/` in the category URL. */
id: string;
/** Display name derived from the category URL path. */
name: string;
/** The sitemap `loc` the category was discovered from. */
url: string;
/** Parent category id; `extractCategoriesFromSitemap` always leaves this undefined. */
parent_id?: string;
}
/**
 * Summary of a product as scraped from the flattened text of a category page.
 */
export interface ProductInfo {
/** Slug generated from the product name (lower-cased, non-alphanumerics turned into dashes, at most 50 chars). */
id: string;
/** Product name text found in front of a `$` price on the page. */
name: string;
/** Numeric value parsed with `parseFloat` from the digits following `$`. */
price: number;
/** Id of the category whose page the product was scraped from. */
category_id: string;
/** Product image URL; text-based extraction leaves this undefined. */
image_url?: string;
/** Product page URL; text-based extraction leaves this undefined. */
product_url?: string;
/** Heuristic brand guess: the leading capitalised word of the name, if any. */
brand?: string;
/** Quantity-plus-unit string such as "2 kg", when one can be found. */
unit?: string;
/** Stock status; text-based extraction always sets this to 'in_stock'. */
availability: string;
}
/**
 * Detailed product record built from an individual product page, or converted
 * from a `ProductInfo` when no matching product page is found.
 * Nullable fields are null when the page did not yield a value.
 */
export interface ProductDetails {
/** Slug generated from the product name. */
id: string;
/** Product name; the page parser falls back to 'Unknown Product' when nothing better is found. */
name: string;
/** Price parsed from a `$`-prefixed number; 0 when no price could be parsed. */
price: number;
/** Brand text from the page, or a guess derived from the product name. */
brand?: string | null;
/** Description text from the page or from its meta description tag. */
description?: string | null;
/** Raw text content of the nutrition section, if present. */
nutrition_info?: string | null;
/** Stock status as set by the parser: 'in_stock', 'out_of_stock' or 'limited'. */
availability: string;
/** Size/quantity text such as "500 g". */
unit?: string | null;
/** Product image URL taken from an image element or the og:image meta tag. */
image_url?: string | null;
/** URL the details were loaded from; empty string when converted from a `ProductInfo` without a URL. */
product_url: string;
/** Category id; the page parser sets null and callers fill it in when known. */
category_id?: string | null;
/** Raw text content of the ingredients section, if present. */
ingredients?: string | null;
/** Raw text content of the allergens section, if present. */
allergens?: string | null;
/** Raw text content of the storage-instructions section, if present. */
storage_instructions?: string | null;
/** Raw text content of the origin section, if present. */
origin?: string | null;
// The flags below are keyword heuristics (substring search on scraped text),
// not verified certifications.
/** True when "organic" appears in the searched text. */
organic: boolean;
/** True when "gluten free" appears in the searched text. */
gluten_free: boolean;
/** True when "dairy free" appears in the searched text. */
dairy_free: boolean;
/** True when "vegan" appears in the searched text. */
vegan: boolean;
/** True when "kosher" appears in the searched text. */
kosher: boolean;
/** True when "halal" appears in the searched text. */
halal: boolean;
}
/**
 * Scraper for the Real Canadian Superstore website.
 *
 * It reads an XML sitemap to discover category pages, extracts products from
 * the flattened text of those pages with price-based regular expressions, and
 * can load individual product pages (with script execution) for more detail.
 * All extraction is heuristic: it depends on the site's markup and text layout.
 */
export class SuperstoreSitemapParser {
  private baseUrl = 'https://www.realcanadiansuperstore.ca';
  private userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

  /**
   * Rejects URLs whose host is a private IP literal.
   *
   * `is_ip_private` works on IP addresses, so the hostname is extracted from
   * the URL first (passing the whole URL never matches). Only IP literals are
   * recognised; host names that resolve to private addresses are not detected.
   *
   * @throws Error when the host is a private IP; TypeError when `url` is not a valid absolute URL.
   */
  private assertPublicUrl(url: string): void {
    // URL.hostname wraps IPv6 literals in brackets; strip them before checking.
    const host = new URL(url).hostname.replace(/^\[|\]$/g, '');
    if (is_ip_private(host)) {
      throw new Error(
        `Fetcher blocked an attempt to fetch a private IP ${url}. This is to prevent a security vulnerability where a local MCP could fetch privileged local IPs and exfiltrate data.`,
      );
    }
  }

  /**
   * Performs a GET request with the configured User-Agent.
   *
   * @param url Absolute URL to fetch.
   * @returns The response, guaranteed to have an OK status.
   * @throws Error prefixed with `Failed to fetch <url>:` for private-IP targets,
   *         network failures and non-2xx statuses.
   */
  private async fetch(url: string): Promise<Response> {
    try {
      this.assertPublicUrl(url);
      // Inside the class body a bare `fetch` is still the global function.
      const response = await fetch(url, {
        headers: {
          "User-Agent": this.userAgent,
        },
      });
      if (!response.ok) {
        throw new Error(`HTTP error: ${response.status}`);
      }
      return response;
    } catch (e: unknown) {
      const reason = e instanceof Error ? e.message : 'Unknown error';
      throw new Error(`Failed to fetch ${url}: ${reason}`);
    }
  }

  /**
   * Downloads a page and returns its visible text with whitespace collapsed.
   * `<script>` and `<style>` elements are removed before the text is read.
   */
  private async getTextContent(url: string): Promise<string> {
    const response = await this.fetch(url);
    const html = await response.text();
    const dom = new JSDOM(html);
    try {
      const document = dom.window.document;
      for (const node of Array.from(document.querySelectorAll('script, style'))) {
        node.remove();
      }
      const text = document.body?.textContent || "";
      return text.replace(/\s+/g, " ").trim();
    } finally {
      // Release the jsdom window instead of leaving it to linger.
      dom.window.close();
    }
  }

  /**
   * Parse XML sitemap and extract URLs.
   *
   * Only `<url>` entries are read. Entries without a non-empty `<loc>` are
   * skipped; element text is trimmed because sitemaps are often pretty-printed.
   *
   * @param sitemapUrl Absolute URL of the sitemap document.
   * @throws Error prefixed with `Failed to parse sitemap:` on fetch or parse failure.
   */
  async parseSitemap(sitemapUrl: string): Promise<SitemapUrl[]> {
    try {
      const response = await this.fetch(sitemapUrl);
      const xmlText = await response.text();
      const dom = new JSDOM(xmlText, { contentType: "text/xml" });
      try {
        const urls: SitemapUrl[] = [];
        for (const urlElement of Array.from(dom.window.document.querySelectorAll('url'))) {
          const childText = (tag: string): string | undefined =>
            urlElement.querySelector(tag)?.textContent?.trim() || undefined;
          const loc = childText('loc');
          if (!loc) {
            continue;
          }
          urls.push({
            loc,
            lastmod: childText('lastmod'),
            changefreq: childText('changefreq'),
            priority: childText('priority'),
          });
        }
        return urls;
      } finally {
        dom.window.close();
      }
    } catch (error) {
      throw new Error(`Failed to parse sitemap: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Extract category information from sitemap URLs.
   *
   * A URL counts as a category when it contains `/en/food/c/{digits}`. Each id
   * is reported once (first occurrence wins). The name comes from the last path
   * segment, ignoring any query string or fragment; when that segment is
   * missing or is just the id itself, the name falls back to `Category {id}`.
   */
  extractCategoriesFromSitemap(sitemapUrls: SitemapUrl[]): CategoryInfo[] {
    const categories: CategoryInfo[] = [];
    const seen = new Set<string>();
    for (const { loc } of sitemapUrls) {
      const categoryId = loc.match(/\/en\/food\/c\/(\d+)/)?.[1];
      if (!categoryId || seen.has(categoryId)) {
        continue;
      }
      seen.add(categoryId);
      const path = loc.split(/[?#]/)[0] ?? loc;
      const slug = path.split('/').filter(Boolean).pop();
      const rawName = slug && slug !== categoryId ? slug : `Category ${categoryId}`;
      categories.push({
        id: categoryId,
        name: this.formatCategoryName(rawName),
        url: loc,
        parent_id: undefined // Not derivable from the URL alone
      });
    }
    return categories;
  }

  /**
   * Extract products from category page text content.
   *
   * Every pattern captures "name text" followed by a `$` price. Candidates are
   * cleaned, filtered through `isValidProduct`, and de-duplicated by name
   * (first occurrence wins). The unit is read from the raw captured text,
   * because the cleaned name has its unit tokens stripped.
   *
   * @param text Whitespace-collapsed page text.
   * @param categoryId Category id stamped on every returned product.
   */
  extractProductsFromText(text: string, categoryId: string): ProductInfo[] {
    const products: ProductInfo[] = [];
    const seenNames = new Set<string>();
    // Fresh regex objects per call so no `lastIndex` state is shared.
    const productPatterns = [
      // Pattern 1: Direct price after name
      /([A-Za-z][^$]*?)\$(\d+\.?\d*)/g,
      // Pattern 2: "about $X" pattern
      /([A-Za-z][^$]*?)about \$(\d+\.?\d*)/g,
      // Pattern 3: "was $X" pattern (sale items)
      /([A-Za-z][^$]*?)was \$(\d+\.?\d*)/g,
      // Pattern 4: Price with unit (e.g., "$1.50/1ea")
      /([A-Za-z][^$]*?)\$(\d+\.?\d*)\/1ea/g,
      // Pattern 5: Price per kg
      /([A-Za-z][^$]*?)\$(\d+\.?\d*)\/1kg/g
    ];
    for (const pattern of productPatterns) {
      let match: RegExpExecArray | null;
      while ((match = pattern.exec(text)) !== null) {
        const rawName = match[1].trim();
        const name = this.cleanProductName(rawName);
        const price = parseFloat(match[2]);
        // Filter out obvious non-products and names already collected
        if (!this.isValidProduct(name, price) || seenNames.has(name)) {
          continue;
        }
        seenNames.add(name);
        products.push({
          id: this.generateProductId(name),
          name,
          price,
          category_id: categoryId,
          image_url: undefined,
          product_url: undefined,
          brand: this.extractBrand(name),
          unit: this.extractUnit(rawName),
          availability: 'in_stock'
        });
      }
    }
    return products;
  }

  /**
   * Get all products from a category with pagination.
   *
   * Pages are requested in order (`?page=N` from the second page on). Paging
   * stops at `maxPages`, on the first fetch failure (which is logged), or as
   * soon as a page contributes no product name that was not already collected.
   * Products are de-duplicated by name across pages.
   */
  async getCategoryProducts(categoryId: string, maxPages: number = 3): Promise<ProductInfo[]> {
    const allProducts: ProductInfo[] = [];
    const seenNames = new Set<string>();
    for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
      const categoryUrl = currentPage === 1
        ? `${this.baseUrl}/en/food/c/${categoryId}`
        : `${this.baseUrl}/en/food/c/${categoryId}?page=${currentPage}`;
      try {
        const text = await this.getTextContent(categoryUrl);
        const freshProducts = this.extractProductsFromText(text, categoryId)
          .filter(product => !seenNames.has(product.name));
        if (freshProducts.length === 0) {
          break; // Empty page, or a repeat of what was already collected
        }
        for (const product of freshProducts) {
          seenNames.add(product.name);
          allProducts.push(product);
        }
      } catch (error) {
        console.error(`Failed to fetch page ${currentPage} for category ${categoryId}: ${error}`);
        break;
      }
    }
    return allProducts;
  }

  /**
   * Get all categories and their products from sitemap.
   *
   * Categories are processed one after another. A failure in one category is
   * logged and does not abort the run.
   *
   * @param sitemapUrl Sitemap to read; defaults to `{baseUrl}/sitemap.xml`.
   * @param maxCategories Only the first N discovered categories are scraped for products.
   * @param pagesPerCategory Maximum number of pages requested per category.
   * @returns Every discovered category, plus the products of the scraped ones.
   * @throws Error prefixed with `Failed to get categories and products:` when the sitemap cannot be read.
   */
  async getAllCategoriesAndProducts(
    sitemapUrl: string = `${this.baseUrl}/sitemap.xml`,
    maxCategories: number = 50,
    pagesPerCategory: number = 2,
  ): Promise<{
    categories: CategoryInfo[];
    products: ProductInfo[];
  }> {
    try {
      const sitemapUrls = await this.parseSitemap(sitemapUrl);
      const categories = this.extractCategoriesFromSitemap(sitemapUrls);
      const products: ProductInfo[] = [];
      for (const category of categories.slice(0, maxCategories)) {
        try {
          const categoryProducts = await this.getCategoryProducts(category.id, pagesPerCategory);
          products.push(...categoryProducts);
        } catch (error) {
          console.error(`Failed to get products for category ${category.id}: ${error}`);
        }
      }
      return { categories, products };
    } catch (error) {
      throw new Error(`Failed to get categories and products: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  // Helper methods

  /**
   * Turns a dash-separated slug into capitalised words.
   * A letter directly after an apostrophe is left alone, so "farmer's" becomes
   * "Farmer's" rather than "Farmer'S".
   */
  private toTitleCase(slug: string): string {
    return slug
      .replace(/-/g, ' ')
      .replace(/(^|[^\w'])(\w)/g, (_whole: string, lead: string, first: string) => lead + first.toUpperCase())
      .trim();
  }

  /** Formats a URL path segment (or fallback label) as a category display name. */
  private formatCategoryName(name: string): string {
    return this.toTitleCase(name);
  }

  /**
   * Strips quantity-plus-unit tokens (e.g. "2 kg", "1.5 kg", "500g") from a name.
   * A token only counts when the unit is not followed by a lowercase letter, so
   * ordinary words such as "6 grain" are kept intact.
   */
  private cleanProductName(name: string): string {
    return name
      .replace(/\d+(?:\.\d+)?\s*(?:ea|kg|lb|g|ml|oz)(?![a-z])/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /** Plausibility filter: sane name length, sane price, and no UI or pricing words in the name. */
  private isValidProduct(name: string, price: number): boolean {
    const blockedFragments = ['Add', 'to cart', 'SAVE', 'was', 'about'];
    return name.length > 3 &&
      name.length < 100 &&
      price > 0 &&
      price < 1000 &&
      !blockedFragments.some(fragment => name.includes(fragment));
  }

  /**
   * Builds a URL-safe id from a product name: lower-cased, runs of
   * non-alphanumerics become one dash, at most 50 characters, and no leading
   * or trailing dash (also not one left behind by the truncation).
   */
  private generateProductId(name: string): string {
    return name.toLowerCase()
      .replace(/[^a-z0-9]+/g, '-')
      .replace(/^-|-$/g, '')
      .substring(0, 50)
      .replace(/-$/, '');
  }

  /**
   * Guesses a brand as the leading capitalised word of the name, optionally
   * ending in an apostrophe or "'s" (e.g. "Farmer's" from "Farmer's Market Apples").
   * Returns undefined when the name does not start with such a word followed by whitespace.
   */
  private extractBrand(name: string): string | undefined {
    return name.match(/^([A-Z][a-z]+(?:'s?)?)\s/)?.[1];
  }

  /** Returns the first quantity-plus-unit token in the text (e.g. "2 kg"), if any. */
  private extractUnit(name: string): string | undefined {
    return name.match(/(\d+(?:\.\d+)?\s*(?:ea|kg|lb|g|ml|oz))(?![a-z])/)?.[1];
  }

  /**
   * Keyword-based dietary flags: a flag is true when its keyword occurs in any
   * of the given texts (case-insensitive). "gluten free" and "dairy free" also
   * match when written with a hyphen or as one word.
   */
  private detectDietaryFlags(...sources: string[]): Pick<ProductDetails, 'organic' | 'gluten_free' | 'dairy_free' | 'vegan' | 'kosher' | 'halal'> {
    const mentions = (keyword: RegExp): boolean => sources.some(source => keyword.test(source));
    return {
      organic: mentions(/organic/i),
      gluten_free: mentions(/gluten[\s-]?free/i),
      dairy_free: mentions(/dairy[\s-]?free/i),
      vegan: mentions(/vegan/i),
      kosher: mentions(/kosher/i),
      halal: mentions(/halal/i)
    };
  }

  /**
   * Get detailed product information from a product URL.
   *
   * The page is loaded in jsdom with script execution enabled, because product
   * pages build their content with JavaScript.
   *
   * SECURITY NOTE: `runScripts: "dangerously"` executes whatever scripts the
   * remote page serves inside this Node.js process. Only call this with URLs
   * of a trusted site.
   *
   * @param productUrl Absolute product page URL.
   * @param renderWaitMs How long to let page scripts run before the DOM is read.
   * @throws Error prefixed with `Failed to get product details:`.
   */
  async getProductDetails(productUrl: string, renderWaitMs: number = 3000): Promise<ProductDetails> {
    try {
      // JSDOM.fromURL does its own networking, so apply the same guard as fetch().
      this.assertPublicUrl(productUrl);
      const dom = await JSDOM.fromURL(productUrl, {
        runScripts: "dangerously",
        resources: "usable",
        pretendToBeVisual: true,
        userAgent: this.userAgent
      });
      try {
        // Give the page's scripts time to populate the DOM
        await new Promise(resolve => setTimeout(resolve, renderWaitMs));
        const html = dom.window.document.documentElement.outerHTML;
        return await this.parseProductDetailsFromText(html, productUrl);
      } finally {
        // Closing the window shuts the page down instead of leaving it running.
        dom.window.close();
      }
    } catch (error) {
      throw new Error(`Failed to get product details: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Get detailed product information by searching for a specific product in a category.
   *
   * Up to 10 distinct product links (`.../p/...`) from the category page are
   * loaded one by one; the first whose name contains `productName`
   * (case-insensitive) is returned with `category_id` set. If none matches,
   * the text-based category listing is searched and a match is converted to
   * `ProductDetails` with the detail fields set to null.
   *
   * @returns The matching product, or null when nothing matches.
   * @throws Error prefixed with `Failed to get product details by name:`.
   */
  async getProductDetailsByName(categoryId: string, productName: string): Promise<ProductDetails | null> {
    try {
      const wanted = productName.toLowerCase();
      const categoryUrl = `${this.baseUrl}/en/food/c/${categoryId}`;
      const response = await this.fetch(categoryUrl);
      const html = await response.text();

      // Collect distinct absolute product URLs, so repeated links to the same
      // product do not use up the page budget below.
      const productUrls = new Set<string>();
      const hrefPattern = /href="([^"]*\/p\/[^"]*)"/g;
      let hrefMatch: RegExpExecArray | null;
      while ((hrefMatch = hrefPattern.exec(html)) !== null) {
        try {
          const href = hrefMatch[1].replace(/&amp;/g, '&');
          productUrls.add(new URL(href, this.baseUrl).href);
        } catch {
          // Ignore hrefs that do not form a valid URL
        }
      }

      for (const productUrl of Array.from(productUrls).slice(0, 10)) { // Limit to first 10 for performance
        try {
          const productDetails = await this.getProductDetails(productUrl);
          if (productDetails.name.toLowerCase().includes(wanted)) {
            productDetails.category_id = categoryId;
            return productDetails;
          }
        } catch (error) {
          // Best effort: report the failure and move on to the next product
          console.error(`Failed to load product page ${productUrl}: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
      }

      // Fallback: Use the basic category page extraction
      const products = await this.getCategoryProducts(categoryId, 1);
      const matchingProduct = products.find(product => product.name.toLowerCase().includes(wanted));
      if (!matchingProduct) {
        return null;
      }
      return {
        id: matchingProduct.id,
        name: matchingProduct.name,
        price: matchingProduct.price,
        brand: matchingProduct.brand || null,
        description: null,
        nutrition_info: null,
        availability: matchingProduct.availability,
        unit: matchingProduct.unit || null,
        image_url: matchingProduct.image_url || null,
        product_url: matchingProduct.product_url || '',
        category_id: matchingProduct.category_id,
        ingredients: null,
        allergens: null,
        storage_instructions: null,
        origin: null,
        ...this.detectDietaryFlags(matchingProduct.name)
      };
    } catch (error) {
      throw new Error(`Failed to get product details by name: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Parse detailed product information from HTML content.
   *
   * Each field is looked up through a list of candidate selectors (first hit
   * wins), with text-based fallbacks for name, price, brand and unit.
   *
   * @param html Serialized product page.
   * @param productUrl URL the page came from; stored on the result and used as a last-resort name source.
   */
  private async parseProductDetailsFromText(html: string, productUrl: string): Promise<ProductDetails> {
    const dom = new JSDOM(html);
    try {
      const document = dom.window.document;

      // First element matching any of the selectors, tried in order.
      const selectFirst = (...selectors: string[]) => {
        for (const selector of selectors) {
          const element = document.querySelector(selector);
          if (element) {
            return element;
          }
        }
        return null;
      };
      // Trimmed text of that element, or null when absent or empty.
      const trimmedText = (...selectors: string[]): string | null =>
        selectFirst(...selectors)?.textContent?.trim() || null;

      // Name: title element, then <title> / og:title, then the URL slug
      let productName = 'Unknown Product';
      const titleElement = selectFirst(
        '[data-testid="product-title"]',
        'h1[data-testid*="title"]',
        'h2[data-testid*="title"]',
        'h3[data-testid*="title"]',
      );
      if (titleElement) {
        productName = titleElement.textContent?.trim() || 'Unknown Product';
      } else {
        const pageTitle = document.querySelector('title')?.textContent || '';
        const metaTitle = document.querySelector('meta[property="og:title"]')?.getAttribute('content') || '';
        if (pageTitle && !pageTitle.includes('Real Canadian Superstore')) {
          productName = pageTitle.split('|')[0].trim() || 'Unknown Product';
        } else if (metaTitle) {
          productName = metaTitle;
        } else {
          const slug = productUrl.split('/').pop()?.split('?')[0] || '';
          if (slug) {
            productName = this.toTitleCase(slug) || productName;
          }
        }
      }

      // Price: a price element if one exists, otherwise the first plausible "$x" in the page
      let price = 0;
      const priceElement = selectFirst(
        '[data-testid="price"]',
        '[data-testid*="price"]',
        '.price',
        '[class*="price"]',
      );
      if (priceElement) {
        const priceMatch = (priceElement.textContent || '').match(/\$(\d+\.?\d*)/);
        price = priceMatch ? parseFloat(priceMatch[1]) : 0;
      } else {
        for (const candidate of html.match(/\$(\d+\.?\d*)/g) ?? []) {
          const value = parseFloat(candidate.replace('$', ''));
          if (value > 0 && value < 1000) {
            price = value;
            break;
          }
        }
      }

      // Brand: a brand element if one exists, otherwise guessed from the start of the name
      let brand: string | null = null;
      const brandElement = selectFirst(
        '[data-testid="brand"]',
        '[data-testid*="brand"]',
        '.brand',
        '[class*="brand"]',
      );
      if (brandElement) {
        brand = brandElement.textContent?.trim() || null;
      } else {
        const brandMatch = productName.match(/^([A-Za-z][a-zA-Z'&\s.-]*?)(?=[A-Z]|\d|\s\S)/);
        if (brandMatch && brandMatch[1].length > 1 && brandMatch[1].length < 20) {
          brand = brandMatch[1].trim();
        }
      }

      // Description: element text, or the `content` attribute when the hit is the meta tag
      const descriptionElement = selectFirst(
        '[data-testid="description"]',
        '[data-testid="product-description"]',
        '.description',
        '[class*="description"]',
        'meta[name="description"]',
      );
      const description: string | null = descriptionElement
        ? descriptionElement.textContent?.trim() ||
          descriptionElement.getAttribute('content')?.trim() ||
          null
        : null;

      // Unit: a unit element if one exists, otherwise a quantity token in the name or page.
      // Longer units are listed before their prefixes so "1 lb" is not read as "1 l".
      let unit: string | null = null;
      const unitElement = selectFirst(
        '[data-testid="unit"]',
        '[data-testid="size"]',
        '.unit',
        '[class*="unit"]',
      );
      if (unitElement) {
        unit = unitElement.textContent?.trim() || null;
      } else {
        const unitPattern = /(\d+(?:\.\d+)?\s*(?:kg|g|ml|lb|l|oz|each|ea|pack|bunch|count))(?![a-z])/i;
        const unitMatch = productName.match(unitPattern) || html.match(unitPattern);
        unit = unitMatch ? unitMatch[1].trim() : null;
      }

      // Image: the alt text is compared in code, because a product name may
      // contain quotes that would make a concatenated CSS selector invalid.
      const imageElement =
        selectFirst('[data-testid="product-image"] img') ||
        (productName
          ? Array.from(document.querySelectorAll('img[alt]')).find(image =>
              (image.getAttribute('alt') || '').includes(productName))
          : undefined) ||
        selectFirst(
          '.product-image img',
          '[class*="product-image"] img',
          'meta[property="og:image"]',
        );
      const imageUrl: string | null = imageElement
        ? imageElement.getAttribute('src') ||
          imageElement.getAttribute('data-src') ||
          imageElement.getAttribute('content') ||
          imageElement.getAttribute('data-lazy-src')
        : null;

      // Availability: defaults to in stock unless the page says otherwise
      let availability = 'in_stock';
      const availabilityElement = selectFirst(
        '[data-testid="availability"]',
        '[data-testid="stock-status"]',
        '.availability',
        '[class*="availability"]',
      );
      if (availabilityElement) {
        const availabilityText = availabilityElement.textContent?.toLowerCase() || '';
        if (availabilityText.includes('out of stock') || availabilityText.includes('unavailable')) {
          availability = 'out_of_stock';
        } else if (availabilityText.includes('limited')) {
          availability = 'limited';
        }
      } else {
        // No dedicated element: look for the phrases anywhere in the raw HTML
        const loweredHtml = html.toLowerCase();
        if (loweredHtml.includes('out of stock') || loweredHtml.includes('unavailable')) {
          availability = 'out_of_stock';
        }
      }

      return {
        id: this.generateProductId(productName),
        name: productName,
        price,
        brand,
        description,
        nutrition_info: trimmedText(
          '[data-testid="nutrition"]',
          '[data-testid="nutrition-facts"]',
          '.nutrition',
          '[class*="nutrition"]',
        ),
        availability,
        unit,
        image_url: imageUrl,
        product_url: productUrl,
        category_id: null, // Will be set by caller if known
        ingredients: trimmedText('[data-testid="ingredients"]', '.ingredients', '[class*="ingredients"]'),
        allergens: trimmedText('[data-testid="allergens"]', '.allergens', '[class*="allergens"]'),
        storage_instructions: trimmedText(
          '[data-testid="storage"]',
          '[data-testid="storage-instructions"]',
          '.storage',
          '[class*="storage"]',
        ),
        origin: trimmedText(
          '[data-testid="origin"]',
          '[data-testid="country-of-origin"]',
          '.origin',
          '[class*="origin"]',
        ),
        // Keywords are searched in the name and in the whole page markup
        ...this.detectDietaryFlags(productName, html)
      };
    } finally {
      dom.window.close();
    }
  }
}