// RagDocs MCP Server
// - src
// - tools
import { URL } from 'url';
/**
 * Error thrown by the URL helpers in this module when a URL string cannot
 * be parsed or processed.
 *
 * The `name` property is set explicitly so that log output and
 * `error.name` checks can distinguish it from a plain `Error`.
 */
export class URLProcessingError extends Error {
  /**
   * @param message Human-readable description of what was wrong with the URL
   */
  constructor(message: string) {
    super(message);

    // Override the inherited "Error" label with this class's own name.
    this.name = 'URLProcessingError';
  }
}
/**
 * Result of parsing and normalizing a URL string.
 *
 * The field descriptions below reflect how `URLProcessor.processURL`
 * populates the object.
 */
export interface ProcessedURL {
/** The input string exactly as it was passed in (before trimming). */
originalUrl: string;
/** Canonical string form of the URL produced by normalization. */
normalizedUrl: string;
/** Lower-cased hostname of the parsed URL. */
domain: string;
/** Pathname of the parsed URL, as reported by the URL parser. */
path: string;
/**
 * Always `true` on objects returned by `URLProcessor.processURL`;
 * an unparseable input makes that method throw instead of returning.
 */
isValid: boolean;
}
/**
 * Static helpers for validating, normalizing and dissecting URLs.
 */
export class URLProcessor {
  /** Matches a leading `http:` or `https:` scheme in any letter case. */
  private static readonly HTTP_SCHEME = /^https?:/i;

  /** Matches a dotted-decimal IPv4 host such as `192.168.1.1`. */
  private static readonly IPV4_HOST = /^\d{1,3}(?:\.\d{1,3}){3}$/;

  /**
   * Validates and normalizes a URL, extracting key components.
   *
   * Surrounding whitespace is ignored, and `https://` is prepended when the
   * input does not already start with an `http:` / `https:` scheme. The
   * scheme check is case-insensitive and requires the colon, so hosts whose
   * name merely begins with "http" (e.g. `httpd.apache.org`) are handled
   * like any other scheme-less input.
   *
   * @param urlString The URL string to process
   * @returns ProcessedURL object containing normalized URL and metadata
   * @throws URLProcessingError if the URL cannot be parsed
   */
  static processURL(urlString: string): ProcessedURL {
    try {
      const trimmedUrl = urlString.trim();

      // Add a protocol only when no http(s) scheme is present.
      const urlWithProtocol = URLProcessor.HTTP_SCHEME.test(trimmedUrl)
        ? trimmedUrl
        : `https://${trimmedUrl}`;

      const url = new URL(urlWithProtocol);

      return {
        originalUrl: urlString,
        // Reference the class rather than `this` so the method still works
        // when passed around as a detached function reference.
        normalizedUrl: URLProcessor.normalizeURL(url),
        domain: url.hostname.toLowerCase(),
        path: url.pathname,
        isValid: true,
      };
    } catch (error) {
      // Anything can be thrown in JavaScript; narrow before reading `.message`.
      const reason = error instanceof Error ? error.message : String(error);
      throw new URLProcessingError(`Invalid URL "${urlString}": ${reason}`);
    }
  }

  /**
   * Normalizes a URL to ensure a consistent string form:
   * - hostname is lower-cased
   * - query parameters are sorted
   * - a single trailing slash is removed from non-root paths
   * - the fragment, if any, is kept
   *
   * The result is assembled from protocol, hostname, port, path, query and
   * fragment only; any username/password in the URL is not included.
   *
   * @param url URL object to normalize
   * @returns Normalized URL string
   */
  private static normalizeURL(url: URL): string {
    const hostname = url.hostname.toLowerCase();

    // The URL parser already reports an empty port when the port is the
    // default for the scheme (80 for http, 443 for https). A non-empty port
    // is therefore always significant and must be preserved: dropping "443"
    // from http://host:443 would point the URL at a different server.
    const { port } = url;

    // Sort query parameters so equivalent URLs compare equal.
    const search = new URLSearchParams([...url.searchParams].sort()).toString();

    // Remove a trailing slash, except when the path is just the root "/".
    let path = url.pathname;
    if (path.length > 1 && path.endsWith('/')) {
      path = path.slice(0, -1);
    }

    let normalizedUrl = `${url.protocol}//${hostname}`;
    if (port) normalizedUrl += `:${port}`;
    normalizedUrl += path;
    if (search) normalizedUrl += `?${search}`;
    if (url.hash) normalizedUrl += url.hash;
    return normalizedUrl;
  }

  /**
   * Checks whether a string is a syntactically valid absolute URL using the
   * `http:` or `https:` scheme.
   *
   * This is a purely syntactic check; no network request is made, so a
   * `true` result does not mean the page exists or is reachable.
   *
   * @param urlString URL to validate
   * @returns true if the string parses as an http(s) URL, false otherwise
   */
  static isValidWebPage(urlString: string): boolean {
    try {
      const { protocol } = new URL(urlString);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      return false;
    }
  }

  /**
   * Extracts the root domain from a URL.
   *
   * This is a heuristic, not a Public Suffix List lookup: the last two
   * labels are returned, or the last three when the hostname ends in a
   * two-letter TLD preceded by a label of at most three characters
   * (covers `co.uk`, `com.au`, `ne.jp`, ...). It can therefore still
   * misjudge hosts such as `www.abc.de`. IPv4 hosts and hostnames with at
   * most two labels are returned unchanged, and a trailing root dot on the
   * hostname is preserved.
   *
   * @param urlString Absolute URL to process
   * @returns Root domain string
   * @throws URLProcessingError if the URL cannot be parsed
   */
  static extractRootDomain(urlString: string): string {
    let hostname: string;
    try {
      ({ hostname } = new URL(urlString));
    } catch {
      throw new URLProcessingError(`Cannot extract domain from invalid URL: ${urlString}`);
    }

    // An IP address has no domain hierarchy; slicing it would yield nonsense.
    if (URLProcessor.IPV4_HOST.test(hostname)) return hostname;

    // Set a trailing root dot ("example.com.") aside so it is not counted as
    // an empty label, then put it back on the result.
    const rootDot = hostname.endsWith('.') ? '.' : '';
    const labels = (rootDot ? hostname.slice(0, -1) : hostname).split('.');
    if (labels.length <= 2) return hostname;

    // Handle special cases like co.uk, com.au: only two-letter (country-code)
    // TLDs are treated as possibly having a second-level suffix, so ordinary
    // hosts such as www.bbc.com resolve to bbc.com.
    const [sld = '', tld = ''] = labels.slice(-2);
    const labelCount = tld.length === 2 && sld.length <= 3 ? 3 : 2;
    return labels.slice(-labelCount).join('.') + rootDot;
  }
}