FedMCP - Federal Parliamentary Information

paywall_detection.py•4.83 KiB

"""
Article-by-article paywall detection.

Detects whether individual news articles are behind a paywall by analyzing:
1. Content truncation patterns
2. Paywall-specific phrases in summaries
3. Source characteristics (metered vs hard paywall)
"""

import re
from typing import Tuple

# Source identifiers grouped by paywall model. Members are listed
# alphabetically; ordering carries no meaning because these are sets.

# HARD paywall: almost all content requires a subscription.
# "globe" is the Globe & Mail, "thehill" the Hill Times, "ipolitics" iPolitics.
HARD_PAYWALL_SOURCES = {"globe", "ipolitics", "thehill"}

# METERED paywall: a mix of free and paywalled articles.
METERED_PAYWALL_SOURCES = {
    "calgary_herald",
    "edmonton_journal",
    "montreal_gazette",
    "national_post",
    "ottawa_citizen",
    "toronto_star",
    "vancouver_sun",
    "winnipeg_free_press",
}

# Always free: never reported as paywalled.
FREE_SOURCES = {
    "canadaland",
    "cbc",
    "ctv",
    "global_news",
    "legisinfo",
    "thenarwhal",
}

# Regex fragments that signal a paywall prompt. They are OR-ed together
# below and matched case-insensitively anywhere in the text.
PAYWALL_PHRASES = [
    # --- English ---
    r"subscribe\s+to\s+(continue|read|access)",
    r"subscription\s+required",
    r"subscribers\s+only",
    r"for\s+subscribers",
    r"sign\s+in\s+to\s+read",
    r"unlock\s+(this|full)\s+(story|article)",
    r"to\s+read\s+this\s+article",
    r"exclusive\s+to\s+subscribers",
    r"premium\s+(content|article)",
    r"members(\s+|-)only",
    r"already\s+a\s+subscriber\?",
    r"start\s+your\s+free\s+trial",
    r"get\s+full\s+access",
    r"limited\s+time\s+offer",
    r"become\s+a\s+(member|subscriber)",
    r"join\s+now\s+to\s+read",
    # --- French ---
    r"abonnez-vous",                # "subscribe"
    r"réservé\s+aux\s+abonnés",     # "reserved for subscribers"
    r"contenu\s+réservé",           # "reserved content"
    r"pour\s+lire\s+cet\s+article",  # "to read this article"
    r"déjà\s+abonné\s*\?",          # "already a subscriber?"
    r"accès\s+complet",             # "full access"
]

# One alternation compiled up front so callers run a single search.
PAYWALL_PATTERN = re.compile("|".join(PAYWALL_PHRASES), flags=re.IGNORECASE)

# Markers that often close a cut-off (paywalled) summary. Every fragment is
# anchored with "$", so it only matches at the very end of the text.
TRUNCATION_INDICATORS = [
    r"\.\.\.$",                  # three literal dots
    r"…$",                       # single ellipsis character
    r"\[\.\.\.\]$",              # bracketed dots
    r"\[more\]$",                # bracketed "more"
    r"read\s+more\.?$",          # "read more", optional full stop
    r"continue\s+reading\.?$",   # "continue reading", optional full stop
]

# Single case-insensitive alternation of all the indicators above.
TRUNCATION_PATTERN = re.compile("|".join(TRUNCATION_INDICATORS), flags=re.IGNORECASE)


# Wording that marks an article from a hard-paywall source as free to read.
_FREE_INDICATOR_PATTERN = re.compile(
    r"free\s+to\s+read|no\s+subscription|open\s+access",
    re.IGNORECASE,
)

# Negated paywall wording. "No subscription required" contains the paywall
# phrase "subscription required" and must not be counted as a paywall prompt.
_NEGATED_PAYWALL_PATTERN = re.compile(
    r"\bno\s+subscription\s+required",
    re.IGNORECASE,
)


def detect_paywall(
    source_id: str,
    title: str,
    summary: str,
    full_content: str = "",
) -> Tuple[bool, float, str]:
    """
    Detect if an article is likely behind a paywall.

    Checks run in this order, and the first one that applies decides:
    1. Source listed in FREE_SOURCES -> not paywalled.
    2. Explicit paywall phrase in title/summary/content -> paywalled.
    3. Source listed in HARD_PAYWALL_SOURCES -> paywalled unless the text
       carries a "free to read"-style indicator.
    4. Source listed in METERED_PAYWALL_SOURCES -> judged from summary
       length and truncation markers.
    5. Any other source -> paywalled only for a short, truncated summary.

    Args:
        source_id: The news source identifier. Surrounding whitespace and
            letter case are ignored; None is treated as an unknown source.
        title: Article title (None is treated as empty).
        summary: Article summary/description from RSS (None is treated as
            empty; surrounding whitespace is ignored).
        full_content: Full article text if available (optional)

    Returns:
        Tuple of (is_paywalled, confidence, reason)
        - is_paywalled: True if article appears to be paywalled
        - confidence: 0.0-1.0 confidence score
        - reason: Human-readable explanation
    """
    # Feed entries can lack fields. Coerce missing values to empty text so
    # len()/regex calls do not raise and the literal word "None" is never
    # scanned as article text.
    source_key = (source_id or "").strip().lower()
    title = title or ""
    # Strip so trailing whitespace cannot hide an end-of-text truncation
    # marker or inflate the length-based checks below.
    summary = (summary or "").strip()
    full_content = full_content or ""

    # Check if source is always free
    if source_key in FREE_SOURCES:
        return False, 1.0, "Source is always free"

    text_to_check = f"{title} {summary} {full_content}".strip()

    # Check for explicit paywall phrases, ignoring negated wording such as
    # "no subscription required".
    phrase_text = _NEGATED_PAYWALL_PATTERN.sub(" ", text_to_check)
    if PAYWALL_PATTERN.search(phrase_text):
        return True, 0.95, "Contains paywall phrase"

    # Hard paywall sources - assume paywalled unless clearly free
    if source_key in HARD_PAYWALL_SOURCES:
        if _FREE_INDICATOR_PATTERN.search(text_to_check):
            return False, 0.7, "Hard paywall source but marked as free"
        return True, 0.85, "Source has hard paywall"

    is_truncated = TRUNCATION_PATTERN.search(summary) is not None
    summary_length = len(summary)

    # Metered paywall sources - check for truncation signals
    if source_key in METERED_PAYWALL_SOURCES:
        # Very short summary often indicates paywall
        if summary_length < 100:
            return True, 0.7, "Metered source with very short summary"

        if is_truncated:
            return True, 0.75, "Metered source with truncated content"

        # Long and not truncated: summary seems complete, likely free article
        if summary_length > 300:
            return False, 0.6, "Metered source with full summary"

        # Default for metered sources - slight lean toward paywalled
        return True, 0.55, "Metered source (uncertain)"

    # Unknown source - check content signals only
    if is_truncated and summary_length < 150:
        return True, 0.5, "Unknown source with truncated content"

    return False, 0.5, "No paywall indicators detected"


def is_paywalled(
    source_id: str,
    title: str,
    summary: str,
    full_content: str = "",
    *,
    min_confidence: float = 0.6,
) -> bool:
    """
    Simple boolean check for paywall status.

    Runs detect_paywall and reports True only when the article is judged
    paywalled with at least ``min_confidence`` confidence.

    Args:
        source_id: The news source identifier
        title: Article title
        summary: Article summary/description from RSS
        full_content: Full article text if available (optional); forwarded
            to detect_paywall.
        min_confidence: Minimum confidence needed to mark the article as
            paywalled. Defaults to 0.6.

    Returns:
        True if the article is considered paywalled, otherwise False.
    """
    paywalled, confidence, _ = detect_paywall(
        source_id, title, summary, full_content
    )
    return paywalled and confidence >= min_confidence

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/northernvariables/FedMCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

paywall_detection.py•4.83 KiB

"""
Article-by-article paywall detection.

Detects whether individual news articles are behind a paywall by analyzing:
1. Content truncation patterns
2. Paywall-specific phrases in summaries
3. Source characteristics (metered vs hard paywall)
"""

import re
from typing import Tuple

# Source identifiers grouped by paywall model. Members are listed
# alphabetically; ordering carries no meaning because these are sets.

# HARD paywall: almost all content requires a subscription.
# "globe" is the Globe & Mail, "thehill" the Hill Times, "ipolitics" iPolitics.
HARD_PAYWALL_SOURCES = {"globe", "ipolitics", "thehill"}

# METERED paywall: a mix of free and paywalled articles.
METERED_PAYWALL_SOURCES = {
    "calgary_herald",
    "edmonton_journal",
    "montreal_gazette",
    "national_post",
    "ottawa_citizen",
    "toronto_star",
    "vancouver_sun",
    "winnipeg_free_press",
}

# Always free: never reported as paywalled.
FREE_SOURCES = {
    "canadaland",
    "cbc",
    "ctv",
    "global_news",
    "legisinfo",
    "thenarwhal",
}

# Regex fragments that signal a paywall prompt. They are OR-ed together
# below and matched case-insensitively anywhere in the text.
PAYWALL_PHRASES = [
    # --- English ---
    r"subscribe\s+to\s+(continue|read|access)",
    r"subscription\s+required",
    r"subscribers\s+only",
    r"for\s+subscribers",
    r"sign\s+in\s+to\s+read",
    r"unlock\s+(this|full)\s+(story|article)",
    r"to\s+read\s+this\s+article",
    r"exclusive\s+to\s+subscribers",
    r"premium\s+(content|article)",
    r"members(\s+|-)only",
    r"already\s+a\s+subscriber\?",
    r"start\s+your\s+free\s+trial",
    r"get\s+full\s+access",
    r"limited\s+time\s+offer",
    r"become\s+a\s+(member|subscriber)",
    r"join\s+now\s+to\s+read",
    # --- French ---
    r"abonnez-vous",                # "subscribe"
    r"réservé\s+aux\s+abonnés",     # "reserved for subscribers"
    r"contenu\s+réservé",           # "reserved content"
    r"pour\s+lire\s+cet\s+article",  # "to read this article"
    r"déjà\s+abonné\s*\?",          # "already a subscriber?"
    r"accès\s+complet",             # "full access"
]

# One alternation compiled up front so callers run a single search.
PAYWALL_PATTERN = re.compile("|".join(PAYWALL_PHRASES), flags=re.IGNORECASE)

# Markers that often close a cut-off (paywalled) summary. Every fragment is
# anchored with "$", so it only matches at the very end of the text.
TRUNCATION_INDICATORS = [
    r"\.\.\.$",                  # three literal dots
    r"…$",                       # single ellipsis character
    r"\[\.\.\.\]$",              # bracketed dots
    r"\[more\]$",                # bracketed "more"
    r"read\s+more\.?$",          # "read more", optional full stop
    r"continue\s+reading\.?$",   # "continue reading", optional full stop
]

# Single case-insensitive alternation of all the indicators above.
TRUNCATION_PATTERN = re.compile("|".join(TRUNCATION_INDICATORS), flags=re.IGNORECASE)


# Wording that marks an article from a hard-paywall source as free to read.
_FREE_INDICATOR_PATTERN = re.compile(
    r"free\s+to\s+read|no\s+subscription|open\s+access",
    re.IGNORECASE,
)

# Negated paywall wording. "No subscription required" contains the paywall
# phrase "subscription required" and must not be counted as a paywall prompt.
_NEGATED_PAYWALL_PATTERN = re.compile(
    r"\bno\s+subscription\s+required",
    re.IGNORECASE,
)


def detect_paywall(
    source_id: str,
    title: str,
    summary: str,
    full_content: str = "",
) -> Tuple[bool, float, str]:
    """
    Detect if an article is likely behind a paywall.

    Checks run in this order, and the first one that applies decides:
    1. Source listed in FREE_SOURCES -> not paywalled.
    2. Explicit paywall phrase in title/summary/content -> paywalled.
    3. Source listed in HARD_PAYWALL_SOURCES -> paywalled unless the text
       carries a "free to read"-style indicator.
    4. Source listed in METERED_PAYWALL_SOURCES -> judged from summary
       length and truncation markers.
    5. Any other source -> paywalled only for a short, truncated summary.

    Args:
        source_id: The news source identifier. Surrounding whitespace and
            letter case are ignored; None is treated as an unknown source.
        title: Article title (None is treated as empty).
        summary: Article summary/description from RSS (None is treated as
            empty; surrounding whitespace is ignored).
        full_content: Full article text if available (optional)

    Returns:
        Tuple of (is_paywalled, confidence, reason)
        - is_paywalled: True if article appears to be paywalled
        - confidence: 0.0-1.0 confidence score
        - reason: Human-readable explanation
    """
    # Feed entries can lack fields. Coerce missing values to empty text so
    # len()/regex calls do not raise and the literal word "None" is never
    # scanned as article text.
    source_key = (source_id or "").strip().lower()
    title = title or ""
    # Strip so trailing whitespace cannot hide an end-of-text truncation
    # marker or inflate the length-based checks below.
    summary = (summary or "").strip()
    full_content = full_content or ""

    # Check if source is always free
    if source_key in FREE_SOURCES:
        return False, 1.0, "Source is always free"

    text_to_check = f"{title} {summary} {full_content}".strip()

    # Check for explicit paywall phrases, ignoring negated wording such as
    # "no subscription required".
    phrase_text = _NEGATED_PAYWALL_PATTERN.sub(" ", text_to_check)
    if PAYWALL_PATTERN.search(phrase_text):
        return True, 0.95, "Contains paywall phrase"

    # Hard paywall sources - assume paywalled unless clearly free
    if source_key in HARD_PAYWALL_SOURCES:
        if _FREE_INDICATOR_PATTERN.search(text_to_check):
            return False, 0.7, "Hard paywall source but marked as free"
        return True, 0.85, "Source has hard paywall"

    is_truncated = TRUNCATION_PATTERN.search(summary) is not None
    summary_length = len(summary)

    # Metered paywall sources - check for truncation signals
    if source_key in METERED_PAYWALL_SOURCES:
        # Very short summary often indicates paywall
        if summary_length < 100:
            return True, 0.7, "Metered source with very short summary"

        if is_truncated:
            return True, 0.75, "Metered source with truncated content"

        # Long and not truncated: summary seems complete, likely free article
        if summary_length > 300:
            return False, 0.6, "Metered source with full summary"

        # Default for metered sources - slight lean toward paywalled
        return True, 0.55, "Metered source (uncertain)"

    # Unknown source - check content signals only
    if is_truncated and summary_length < 150:
        return True, 0.5, "Unknown source with truncated content"

    return False, 0.5, "No paywall indicators detected"


def is_paywalled(
    source_id: str,
    title: str,
    summary: str,
    full_content: str = "",
    *,
    min_confidence: float = 0.6,
) -> bool:
    """
    Simple boolean check for paywall status.

    Runs detect_paywall and reports True only when the article is judged
    paywalled with at least ``min_confidence`` confidence.

    Args:
        source_id: The news source identifier
        title: Article title
        summary: Article summary/description from RSS
        full_content: Full article text if available (optional); forwarded
            to detect_paywall.
        min_confidence: Minimum confidence needed to mark the article as
            paywalled. Defaults to 0.6.

    Returns:
        True if the article is considered paywalled, otherwise False.
    """
    paywalled, confidence, _ = detect_paywall(
        source_id, title, summary, full_content
    )
    return paywalled and confidence >= min_confidence