Scrapy MCP Server

scrapy-mcp
examples

extraction_configs.py•12.1 KiB

"""Advanced extraction configuration examples.

This file contains various extraction configurations for different types of
websites and use cases, demonstrating the flexibility of the scraping system.

Every configuration maps an output field name to a settings dict with three
keys:

* ``selector`` - a CSS selector (comma-separated alternatives are allowed),
* ``attr``     - what to read from each matched element (``"text"``, an
  attribute name such as ``"src"``/``"href"``, or ``"outerHTML"``),
* ``multiple`` - ``True`` to collect every match, ``False`` for a single one.
"""

from typing import Any, Dict, Optional

# E-commerce product extraction
ECOMMERCE_CONFIG = {
    "product_name": {
        "selector": "h1.product-title, .product-name h1, [data-testid='product-name']",
        "attr": "text",
        "multiple": False,
    },
    "price": {
        "selector": ".price, .product-price, [data-testid='price']",
        "attr": "text",
        "multiple": False,
    },
    "description": {
        "selector": ".product-description, .description, [data-testid='description']",
        "attr": "text",
        "multiple": False,
    },
    "images": {
        "selector": ".product-image img, .gallery img",
        "attr": "src",
        "multiple": True,
    },
    "availability": {
        "selector": ".availability, .stock-status, [data-testid='availability']",
        "attr": "text",
        "multiple": False,
    },
    "rating": {
        "selector": ".rating, .stars, [data-testid='rating']",
        "attr": "text",
        "multiple": False,
    },
    "specifications": {
        "selector": ".specifications li, .specs .spec-item",
        "attr": "text",
        "multiple": True,
    },
}

# News article extraction
NEWS_ARTICLE_CONFIG = {
    "headline": {
        "selector": "h1, .headline, .article-title",
        "attr": "text",
        "multiple": False,
    },
    "author": {
        "selector": ".author, .byline, [rel='author']",
        "attr": "text",
        "multiple": False,
    },
    "publish_date": {
        "selector": "time, .date, .published-date",
        "attr": "datetime",
        "multiple": False,
    },
    "article_body": {
        "selector": ".article-body p, .content p, .post-content p",
        "attr": "text",
        "multiple": True,
    },
    "tags": {"selector": ".tags a, .categories a", "attr": "text", "multiple": True},
    "featured_image": {
        "selector": ".featured-image img, .article-image img",
        "attr": "src",
        "multiple": False,
    },
}

# Social media profile extraction
SOCIAL_PROFILE_CONFIG = {
    "username": {
        "selector": ".username, .handle, .profile-username",
        "attr": "text",
        "multiple": False,
    },
    "display_name": {
        "selector": ".display-name, .profile-name, h1",
        "attr": "text",
        "multiple": False,
    },
    "bio": {
        "selector": ".bio, .description, .profile-description",
        "attr": "text",
        "multiple": False,
    },
    "follower_count": {
        "selector": ".followers-count, .follower-stats",
        "attr": "text",
        "multiple": False,
    },
    "following_count": {
        "selector": ".following-count, .following-stats",
        "attr": "text",
        "multiple": False,
    },
    "posts": {"selector": ".post, .tweet, .update", "attr": "text", "multiple": True},
    "profile_image": {
        "selector": ".profile-image img, .avatar img",
        "attr": "src",
        "multiple": False,
    },
}

# Job listing extraction
JOB_LISTING_CONFIG = {
    "job_title": {
        "selector": "h1, .job-title, .position-title",
        "attr": "text",
        "multiple": False,
    },
    "company_name": {
        "selector": ".company-name, .employer, .company",
        "attr": "text",
        "multiple": False,
    },
    "location": {
        "selector": ".location, .job-location",
        "attr": "text",
        "multiple": False,
    },
    "salary": {
        "selector": ".salary, .pay, .compensation",
        "attr": "text",
        "multiple": False,
    },
    "job_type": {
        "selector": ".job-type, .employment-type",
        "attr": "text",
        "multiple": False,
    },
    "description": {
        "selector": ".job-description, .description",
        "attr": "text",
        "multiple": False,
    },
    "requirements": {
        "selector": ".requirements li, .qualifications li",
        "attr": "text",
        "multiple": True,
    },
    "benefits": {
        "selector": ".benefits li, .perks li",
        "attr": "text",
        "multiple": True,
    },
    "posted_date": {
        "selector": ".posted-date, .job-date, time",
        "attr": "text",
        "multiple": False,
    },
}

# Real estate listing extraction
REAL_ESTATE_CONFIG = {
    "property_title": {
        "selector": "h1, .property-title, .listing-title",
        "attr": "text",
        "multiple": False,
    },
    "price": {
        "selector": ".price, .property-price, .listing-price",
        "attr": "text",
        "multiple": False,
    },
    "address": {
        "selector": ".address, .property-address, .location",
        "attr": "text",
        "multiple": False,
    },
    "bedrooms": {
        "selector": ".bedrooms, .beds, [data-testid='bedrooms']",
        "attr": "text",
        "multiple": False,
    },
    "bathrooms": {
        "selector": ".bathrooms, .baths, [data-testid='bathrooms']",
        "attr": "text",
        "multiple": False,
    },
    "square_footage": {
        "selector": ".square-feet, .sqft, .area",
        "attr": "text",
        "multiple": False,
    },
    "description": {
        "selector": ".property-description, .listing-description",
        "attr": "text",
        "multiple": False,
    },
    "features": {
        "selector": ".features li, .amenities li",
        "attr": "text",
        "multiple": True,
    },
    "images": {
        "selector": ".property-images img, .listing-photos img",
        "attr": "src",
        "multiple": True,
    },
    "agent_info": {
        "selector": ".agent-name, .realtor-name",
        "attr": "text",
        "multiple": False,
    },
}

# Restaurant menu extraction
RESTAURANT_MENU_CONFIG = {
    "restaurant_name": {
        "selector": "h1, .restaurant-name, .business-name",
        "attr": "text",
        "multiple": False,
    },
    "menu_categories": {
        "selector": ".menu-category, .category-title",
        "attr": "text",
        "multiple": True,
    },
    "menu_items": {
        "selector": ".menu-item",
        "attr": "outerHTML",  # Get full HTML to parse item details
        "multiple": True,
    },
    "item_names": {
        "selector": ".item-name, .dish-name",
        "attr": "text",
        "multiple": True,
    },
    "item_prices": {
        "selector": ".item-price, .price",
        "attr": "text",
        "multiple": True,
    },
    "item_descriptions": {
        "selector": ".item-description, .dish-description",
        "attr": "text",
        "multiple": True,
    },
    "contact_info": {
        "selector": ".contact, .phone, .address",
        "attr": "text",
        "multiple": True,
    },
}

# Academic paper/research extraction
ACADEMIC_PAPER_CONFIG = {
    "title": {
        "selector": "h1, .article-title, .paper-title",
        "attr": "text",
        "multiple": False,
    },
    "authors": {
        "selector": ".authors .author, .author-list .author",
        "attr": "text",
        "multiple": True,
    },
    "abstract": {"selector": ".abstract, .summary", "attr": "text", "multiple": False},
    "keywords": {
        "selector": ".keywords .keyword, .tags .tag",
        "attr": "text",
        "multiple": True,
    },
    "publication_date": {
        "selector": ".pub-date, .published, time",
        "attr": "text",
        "multiple": False,
    },
    "journal": {
        "selector": ".journal, .publication",
        "attr": "text",
        "multiple": False,
    },
    "doi": {"selector": ".doi, [data-doi]", "attr": "text", "multiple": False},
    "citations": {
        "selector": ".citation-count, .citations",
        "attr": "text",
        "multiple": False,
    },
    "references": {
        "selector": ".references li, .bibliography li",
        "attr": "text",
        "multiple": True,
    },
}

# Forum/discussion extraction
FORUM_POST_CONFIG = {
    "thread_title": {
        "selector": "h1, .thread-title, .topic-title",
        "attr": "text",
        "multiple": False,
    },
    "original_post": {
        "selector": ".original-post .content, .first-post .message",
        "attr": "text",
        "multiple": False,
    },
    "post_author": {
        "selector": ".post-author, .username, .user-name",
        "attr": "text",
        "multiple": False,
    },
    "post_date": {
        "selector": ".post-date, .timestamp, time",
        "attr": "text",
        "multiple": False,
    },
    "replies": {
        "selector": ".reply .content, .post .message",
        "attr": "text",
        "multiple": True,
    },
    "reply_authors": {
        "selector": ".reply .author, .post .username",
        "attr": "text",
        "multiple": True,
    },
    "vote_count": {
        "selector": ".votes, .score, .points",
        "attr": "text",
        "multiple": False,
    },
    "tags": {
        "selector": ".tags .tag, .categories .category",
        "attr": "text",
        "multiple": True,
    },
}

# Event listing extraction
EVENT_LISTING_CONFIG = {
    "event_name": {
        "selector": "h1, .event-title, .event-name",
        "attr": "text",
        "multiple": False,
    },
    "event_date": {
        "selector": ".event-date, .date, time",
        "attr": "text",
        "multiple": False,
    },
    "event_time": {"selector": ".event-time, .time", "attr": "text", "multiple": False},
    "venue": {
        "selector": ".venue, .location, .event-location",
        "attr": "text",
        "multiple": False,
    },
    "address": {
        "selector": ".address, .venue-address",
        "attr": "text",
        "multiple": False,
    },
    "description": {
        "selector": ".event-description, .description",
        "attr": "text",
        "multiple": False,
    },
    "ticket_price": {
        "selector": ".ticket-price, .price",
        "attr": "text",
        "multiple": False,
    },
    "organizer": {
        "selector": ".organizer, .event-organizer",
        "attr": "text",
        "multiple": False,
    },
    "categories": {
        "selector": ".categories .category, .tags .tag",
        "attr": "text",
        "multiple": True,
    },
}

# Contact page extraction
CONTACT_PAGE_CONFIG = {
    "company_name": {
        "selector": "h1, .company-name, .business-name",
        "attr": "text",
        "multiple": False,
    },
    "phone_numbers": {
        "selector": ".phone, .tel, [href^='tel:']",
        "attr": "text",
        "multiple": True,
    },
    "email_addresses": {
        "selector": ".email, [href^='mailto:']",
        "attr": "href",
        "multiple": True,
    },
    "physical_address": {
        "selector": ".address, .location, .contact-address",
        "attr": "text",
        "multiple": False,
    },
    "business_hours": {
        "selector": ".hours, .opening-hours, .business-hours",
        "attr": "text",
        "multiple": False,
    },
    "social_media": {
        "selector": ".social-links a, .social-media a",
        "attr": "href",
        "multiple": True,
    },
    "contact_form": {"selector": "form", "attr": "outerHTML", "multiple": False},
}

# Collection of all configs for easy access
EXTRACTION_CONFIGS = {
    "ecommerce": ECOMMERCE_CONFIG,
    "news": NEWS_ARTICLE_CONFIG,
    "social": SOCIAL_PROFILE_CONFIG,
    "jobs": JOB_LISTING_CONFIG,
    "realestate": REAL_ESTATE_CONFIG,
    "restaurant": RESTAURANT_MENU_CONFIG,
    "academic": ACADEMIC_PAPER_CONFIG,
    "forum": FORUM_POST_CONFIG,
    "events": EVENT_LISTING_CONFIG,
    "contact": CONTACT_PAGE_CONFIG,
}


def get_config_for_site_type(site_type: str) -> Optional[Dict[str, Dict[str, Any]]]:
    """Get extraction configuration for a specific site type.

    The lookup ignores letter case and surrounding whitespace, so ``"News"``
    and ``" news "`` both resolve to the ``"news"`` entry.

    Args:
        site_type: One of the keys of ``EXTRACTION_CONFIGS`` (for example
            ``"ecommerce"`` or ``"jobs"``).

    Returns:
        The matching configuration dict itself (not a copy - mutating it
        changes the module-level constant), or ``None`` when ``site_type`` is
        not a string or does not name a known configuration.
    """
    # A non-string key can never match; report "unknown" the same way an
    # unrecognised name is reported instead of failing on ``.lower()``.
    if not isinstance(site_type, str):
        return None
    return EXTRACTION_CONFIGS.get(site_type.strip().lower())


def print_all_configs() -> None:
    """Print all available configurations.

    Writes a banner to stdout followed by one section per entry of
    ``EXTRACTION_CONFIGS``; each field is shown with its selector and the
    attribute that is read (``text`` when the entry does not specify one).
    """
    print("Available extraction configurations:")
    print("=" * 50)

    for name, config in EXTRACTION_CONFIGS.items():
        print(f"\n{name.upper()} CONFIG:")
        print("-" * 20)

        for field, settings in config.items():
            if not isinstance(settings, dict):
                # Non-dict entries are shown verbatim.
                print(f" {field}: {settings}")
                continue

            # ``.get`` keeps the listing going even if an entry lacks a
            # selector, mirroring how a missing ``attr`` is tolerated.
            selector = settings.get("selector", "<missing selector>")
            attr = settings.get("attr", "text")
            print(f" {field}: {selector} ({attr})")


if __name__ == "__main__":
    print_all_configs()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ThreeFish-AI/scrapy-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

extraction_configs.py•12.1 KiB