guess-datetime.ts•42.9 kB
// Utility functions for guessing datetime from web pages
// Refactored from Cloudflare Worker for MCP tool usage
// Improved parseDate function to handle more date formats
function parseDate(dateStr: string): Date | null {
if (!dateStr) return null;
// Clean up the string (remove extra spaces, normalize separators)
let cleanStr = dateStr.trim()
.replace(/\s+/g, ' ')
.replace(/-(\d{2}:)/, ' $1'); // Fix formats like 2025-03-05-21:25:00
// Try direct parsing first
const date = new Date(cleanStr);
if (!isNaN(date.getTime())) return date;
// Try parsing ISO-like formats with variations
const isoPattern = /(\d{4})[-\/](\d{1,2})[-\/](\d{1,2})(?:[T\s-](\d{1,2})[:\.](\d{1,2})(?:[:\.](\d{1,2}))?)?/;
const isoMatch = cleanStr.match(isoPattern);
if (isoMatch) {
const year = parseInt(isoMatch[1]);
const month = parseInt(isoMatch[2]) - 1; // JS months are 0-indexed
const day = parseInt(isoMatch[3]);
const hour = isoMatch[4] ? parseInt(isoMatch[4]) : 0;
const minute = isoMatch[5] ? parseInt(isoMatch[5]) : 0;
const second = isoMatch[6] ? parseInt(isoMatch[6]) : 0;
const newDate = new Date(year, month, day, hour, minute, second);
if (!isNaN(newDate.getTime())) return newDate;
}
// Try MM/DD/YYYY and DD/MM/YYYY formats
const slashPattern = /(\d{1,2})[\/\-\.](\d{1,2})[\/\-\.](\d{4})/;
const slashMatch = cleanStr.match(slashPattern);
if (slashMatch) {
// Try both MM/DD/YYYY and DD/MM/YYYY interpretations
const parts = [parseInt(slashMatch[1]), parseInt(slashMatch[2]), parseInt(slashMatch[3])];
// MM/DD/YYYY attempt
const usDate = new Date(parts[2], parts[0] - 1, parts[1]);
if (!isNaN(usDate.getTime()) && usDate.getMonth() === parts[0] - 1 && usDate.getDate() === parts[1]) {
return usDate;
}
// DD/MM/YYYY attempt
const euDate = new Date(parts[2], parts[1] - 1, parts[0]);
if (!isNaN(euDate.getTime()) && euDate.getMonth() === parts[1] - 1 && euDate.getDate() === parts[0]) {
return euDate;
}
}
// Try month name patterns
const monthNamePattern = /([A-Za-z]+)\s+(\d{1,2})(?:st|nd|rd|th)?,?\s+(\d{4})/i;
const monthMatch = cleanStr.match(monthNamePattern);
if (monthMatch) {
const newDate = new Date(`${monthMatch[1]} ${monthMatch[2]}, ${monthMatch[3]}`);
if (!isNaN(newDate.getTime())) return newDate;
}
// Try international date formats
// Chinese: YYYY年MM月DD日
const chinesePattern = /(\d{4})年(\d{1,2})月(\d{1,2})日/;
const chineseMatch = cleanStr.match(chinesePattern);
if (chineseMatch) {
const year = parseInt(chineseMatch[1]);
const month = parseInt(chineseMatch[2]) - 1;
const day = parseInt(chineseMatch[3]);
const newDate = new Date(year, month, day);
if (!isNaN(newDate.getTime())) return newDate;
}
// Japanese: YYYY年MM月DD日
const japanesePattern = /(\d{4})年(\d{1,2})月(\d{1,2})日/;
const japaneseMatch = cleanStr.match(japanesePattern);
if (japaneseMatch) {
const year = parseInt(japaneseMatch[1]);
const month = parseInt(japaneseMatch[2]) - 1;
const day = parseInt(japaneseMatch[3]);
const newDate = new Date(year, month, day);
if (!isNaN(newDate.getTime())) return newDate;
}
// European: DD.MM.YYYY
const europeanPattern = /(\d{1,2})\.(\d{1,2})\.(\d{4})/;
const europeanMatch = cleanStr.match(europeanPattern);
if (europeanMatch) {
const day = parseInt(europeanMatch[1]);
const month = parseInt(europeanMatch[2]) - 1;
const year = parseInt(europeanMatch[3]);
const newDate = new Date(year, month, day);
if (!isNaN(newDate.getTime())) return newDate;
}
// Korean: YYYY-MM-DD
const koreanPattern = /(\d{4})-(\d{1,2})-(\d{1,2})/;
const koreanMatch = cleanStr.match(koreanPattern);
if (koreanMatch) {
const year = parseInt(koreanMatch[1]);
const month = parseInt(koreanMatch[2]) - 1;
const day = parseInt(koreanMatch[3]);
const newDate = new Date(year, month, day);
if (!isNaN(newDate.getTime())) return newDate;
}
// Try Unix timestamps (seconds or milliseconds)
if (/^\d+$/.test(cleanStr)) {
const timestamp = parseInt(cleanStr);
// If the number is too small to be a millisecond timestamp but could be seconds
const date = new Date(timestamp > 9999999999 ? timestamp : timestamp * 1000);
if (!isNaN(date.getTime()) && date.getFullYear() > 1970 && date.getFullYear() < 2100) {
return date;
}
}
return null;
}
// Extract Schema.org timestamps
function extractSchemaOrgTimestamps(html: string): Array<{
type: string;
field: string;
date: string;
priority: string;
context: string;
}> {
const results: Array<{
type: string;
field: string;
date: string;
priority: string;
context: string;
}> = [];
// Find JSON+LD scripts with Schema.org data
const schemaPattern = /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
let schemaMatch;
while ((schemaMatch = schemaPattern.exec(html)) !== null) {
try {
const jsonData = JSON.parse(schemaMatch[1]);
// Process array of schemas or single schema object
const schemas = Array.isArray(jsonData) ? jsonData : [jsonData];
for (const schema of schemas) {
// Handle nested graphs
if (schema['@graph'] && Array.isArray(schema['@graph'])) {
for (const item of schema['@graph']) {
extractDatesFromSchema(item, results);
}
} else {
extractDatesFromSchema(schema, results);
}
}
} catch (e) {
// Skip invalid JSON
}
}
return results;
}
// Helper function to extract dates from schema objects
function extractDatesFromSchema(schema: any, results: Array<{
type: string;
field: string;
date: string;
priority: string;
context: string;
}>) {
const dateProperties = [
'dateModified',
'dateUpdated',
'datePublished',
'dateCreated',
'uploadDate',
'lastReviewed'
];
for (const prop of dateProperties) {
if (schema[prop]) {
const date = parseDate(schema[prop]);
if (date) {
results.push({
type: 'schemaOrg',
field: prop,
date: date.toISOString(),
priority: prop === 'dateModified' ? 'high' :
prop === 'dateUpdated' ? 'high' : 'medium',
context: `Schema.org ${schema['@type'] || 'object'}`
});
}
}
}
// Check for nested objects that might contain dates
if (schema.mainEntity) {
extractDatesFromSchema(schema.mainEntity, results);
}
// Handle Article specific schema
if (schema['@type'] === 'Article' && schema.author) {
const authorObj = typeof schema.author === 'object' ? schema.author : {};
for (const prop of dateProperties) {
if (authorObj[prop]) {
const date = parseDate(authorObj[prop]);
if (date) {
results.push({
type: 'schemaOrg',
field: `author.${prop}`,
date: date.toISOString(),
priority: 'medium',
context: 'Article author'
});
}
}
}
}
}
// Extract HTML comments that might contain version info
function extractHtmlComments(html: string): Array<{
type: string;
date: string;
context: string;
version?: string;
}> {
const results: Array<{
type: string;
date: string;
context: string;
version?: string;
}> = [];
const commentPattern = /<!--([\s\S]*?)-->/g;
let commentMatch;
while ((commentMatch = commentPattern.exec(html)) !== null) {
const comment = commentMatch[1];
// Look for version patterns
const versionPattern = /(?:version|v|revision|rev|updated|modified|timestamp)[\s:=]+([0-9.]+|\d{4}-\d{2}-\d{2}|\d{2}\/\d{2}\/\d{4})/i;
const versionMatch = comment.match(versionPattern);
if (versionMatch) {
// Try to parse as date
const date = parseDate(versionMatch[1]);
if (date) {
results.push({
type: 'htmlComment',
date: date.toISOString(),
context: comment.trim().substring(0, 100),
version: versionMatch[1]
});
} else if (/\d{4}-\d{2}-\d{2}/.test(versionMatch[1])) {
// Looks like a date but couldn't parse, try manual parsing
const parts = versionMatch[1].split(/[-\/]/);
if (parts.length === 3 && parts[0].length === 4) {
const date = new Date(`${parts[0]}-${parts[1]}-${parts[2]}`);
if (!isNaN(date.getTime())) {
results.push({
type: 'htmlComment',
date: date.toISOString(),
context: comment.trim().substring(0, 100)
});
}
}
}
}
// Look for date patterns
const datePattern = /(\d{4}-\d{2}-\d{2}|\d{2}\/\d{2}\/\d{4}|\d{1,2} [A-Za-z]+ \d{4})/;
const dateMatch = comment.match(datePattern);
if (dateMatch && !versionMatch) {
const date = parseDate(dateMatch[1]);
if (date) {
results.push({
type: 'htmlComment',
date: date.toISOString(),
context: comment.trim().substring(0, 100)
});
}
}
}
return results;
}
// Check Git blame info (sometimes exposed in HTML comments)
function extractGitInfo(html: string): {
gitHash?: string;
gitDate?: string;
deployDate?: string;
} {
const results: {
gitHash?: string;
gitDate?: string;
deployDate?: string;
} = {};
// Look for Git hash in comments
const gitHashPattern = /<!--[\s\S]*?(?:commit|hash|git)[:\s]+([a-f0-9]{7,40})[\s\S]*?-->/i;
const gitHashMatch = html.match(gitHashPattern);
if (gitHashMatch) {
results.gitHash = gitHashMatch[1];
// Look for date near the hash
const nearbyDatePattern = new RegExp(`<!--[\\s\\S]*?${gitHashMatch[1]}[\\s\\S]*?(\\d{4}-\\d{2}-\\d{2}|\\d{2}/\\d{2}/\\d{4})[\\s\\S]*?-->`, 'i');
const nearbyDateMatch = html.match(nearbyDatePattern);
if (nearbyDateMatch) {
const date = parseDate(nearbyDateMatch[1]);
if (date) {
results.gitDate = date.toISOString();
}
}
}
// Look for GitLab/GitHub deploy comments
const deployPattern = /<!--[\s\S]*?(?:deployed|deployment|deploy)[\s\S]*?((?:\d{4}-\d{2}-\d{2})|(?:\d{2}\/\d{2}\/\d{4}))[\\s\\S]*?-->/i;
const deployMatch = html.match(deployPattern);
if (deployMatch) {
const date = parseDate(deployMatch[1]);
if (date) {
results.deployDate = date.toISOString();
}
}
return results;
}
// Extract meta update times
function extractMetaUpdateTimes(html: string): {
lastModified?: string;
publishedDate?: string;
articleModified?: string;
articlePublished?: string;
pageGenerated?: string;
ogUpdatedTime?: string;
lastmod?: string;
generated?: string;
build?: string;
revision?: string;
version?: string;
} {
const results: {
lastModified?: string;
publishedDate?: string;
articleModified?: string;
articlePublished?: string;
pageGenerated?: string;
ogUpdatedTime?: string;
lastmod?: string;
generated?: string;
build?: string;
revision?: string;
version?: string;
} = {};
// More flexible meta tag pattern matching (attribute order independent)
const metaTimePattern = /<meta\s+(?:[^>]*?\s+)?(?:name|property)=["']([^"']+)["'](?:[^>]*?\s+)?content=["']([^"']+)["']|<meta\s+(?:[^>]*?\s+)?content=["']([^"']+)["'](?:[^>]*?\s+)?(?:name|property)=["']([^"']+)["']/gi;
let match;
while ((match = metaTimePattern.exec(html)) !== null) {
// Handle both attribute orders
const name = match[1] || match[4];
const content = match[2] || match[3];
if (!name || !content) continue;
// Check for various time-related meta tags
if (/last[-_]?modified|modified[-_]?time|update[-_]?time|date[-_]?modified|modified|revision/i.test(name)) {
const date = parseDate(content);
if (date) {
results.lastModified = date.toISOString();
}
}
else if (/published[-_]?time|pub[-_]?date|date[-_]?published|creation[-_]?date|firstpublishedtime/i.test(name)) {
const date = parseDate(content);
if (date) {
results.publishedDate = date.toISOString();
}
}
else if (/article:modified_time|og:updated_time/i.test(name)) {
const date = parseDate(content);
if (date) {
results.articleModified = date.toISOString();
}
}
else if (/article:published_time|og:published_time/i.test(name)) {
const date = parseDate(content);
if (date) {
results.articlePublished = date.toISOString();
}
}
else if (/og:updated_time/i.test(name)) {
const date = parseDate(content);
if (date) {
results.ogUpdatedTime = date.toISOString();
}
}
else if (/lastmod|last[-_]?mod/i.test(name)) {
const date = parseDate(content);
if (date) {
results.lastmod = date.toISOString();
}
}
else if (/generated|gen[-_]?date|build[-_]?date/i.test(name)) {
const date = parseDate(content);
if (date) {
results.generated = date.toISOString();
}
}
else if (/build|build[-_]?time|build[-_]?date/i.test(name)) {
const date = parseDate(content);
if (date) {
results.build = date.toISOString();
}
}
else if (/revision|rev/i.test(name)) {
const date = parseDate(content);
if (date) {
results.revision = date.toISOString();
}
}
else if (/version|ver/i.test(name) && /\d{4}[-\/]\d{1,2}[-\/]\d{1,2}/.test(content)) {
// Extract date from version content that might contain other text
const dateMatch = content.match(/(\d{4}[-\/]\d{1,2}[-\/]\d{1,2}(?:\s+\d{1,2}:\d{1,2}(?::\d{1,2})?)?)/);
if (dateMatch) {
const date = parseDate(dateMatch[1]);
if (date) {
results.version = date.toISOString();
}
}
}
else if (/page[-_]?generated[-_]?time|gendate|generated|others/i.test(name) && /\d{4}[-\/]\d{1,2}[-\/]\d{1,2}/.test(content)) {
// Extract date from content that might contain other text
const dateMatch = content.match(/(\d{4}[-\/]\d{1,2}[-\/]\d{1,2}(?:\s+\d{1,2}:\d{1,2}(?::\d{1,2})?)?)/);
if (dateMatch) {
const date = parseDate(dateMatch[1]);
if (date) {
results.pageGenerated = date.toISOString();
}
}
}
}
return results;
}
// Extract visible dates from HTML content
function extractVisibleDates(html: string): Array<{
type: string;
date: string;
context: string;
priority: string;
}> {
const results: Array<{
type: string;
date: string;
context: string;
priority: string;
}> = [];
// Common date indicator classes
const dateClassPattern = /<(?:div|span|p)\s+class=["'](?:[^"']*\s+)?(?:date|time|timestamp|pubdate|updated|modified|posted-on|entry-date|publish-date|post-date)[^"']*["'][^>]*>([^<]+)/gi;
let match;
while ((match = dateClassPattern.exec(html)) !== null) {
const content = match[1].trim();
// Check if content looks date-like
if (/\d{4}/.test(content)) {
const date = parseDate(content);
if (date) {
results.push({
type: 'dateClass',
date: date.toISOString(),
context: content,
priority: 'medium'
});
}
}
}
// Extract dates from time elements (more reliable)
const timePattern = /<time(?:\s+[^>]*)?\s+datetime=["']([^"']+)["'][^>]*>.*?<\/time>/gi;
while ((match = timePattern.exec(html)) !== null) {
const datetime = match[1];
const date = parseDate(datetime);
if (date) {
results.push({
type: 'timeElement',
date: date.toISOString(),
context: match[0].substring(0, 100),
priority: 'high' // Time elements are usually more reliable
});
}
}
// Common date patterns in text
const dateFormatPatterns = [
// ISO 8601
/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d{3})?(?:Z|[+-]\d{2}:\d{2})/g,
// Common date formats
/\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}(?::\d{2})?/g,
/\d{2}\/\d{2}\/\d{4}\s+\d{2}:\d{2}(?::\d{2})?/g,
// Date only formats
/\d{4}-\d{2}-\d{2}/g,
/\d{2}\/\d{2}\/\d{4}/g
];
for (const pattern of dateFormatPatterns) {
let dateMatch;
while ((dateMatch = pattern.exec(html)) !== null) {
const date = parseDate(dateMatch[0]);
if (date) {
results.push({
type: 'visibleDate',
date: date.toISOString(),
context: html.substring(Math.max(0, dateMatch.index - 50),
dateMatch.index + dateMatch[0].length + 50),
priority: 'medium'
});
}
}
}
// Update phrases with more variants
const updatePhrasePattern = /(?:updated|last modified|modified|revised|last updated|posted|published)(?:\s*(?:on|at|date|time))?[::]\s*([^<\n\r]{5,30})/gi;
while ((match = updatePhrasePattern.exec(html)) !== null) {
const dateStr = match[1].trim();
const date = parseDate(dateStr);
if (date) {
results.push({
type: 'updatePhrase',
date: date.toISOString(),
context: match[0],
priority: 'high'
});
}
}
return results;
}
// Extract JavaScript timestamps
function extractJavaScriptTimestamps(html: string): Array<{
type: string;
date: string;
context: string;
field?: string;
priority?: string;
}> {
const results: Array<{
type: string;
date: string;
context: string;
field?: string;
priority?: string;
}> = [];
// Find timestamps in script tags
const scriptPattern = /<script[^>]*>([\s\S]*?)<\/script>/gi;
let scriptMatch;
while ((scriptMatch = scriptPattern.exec(html)) !== null) {
const scriptContent = scriptMatch[1];
// Look for timestamp variables
const timestampPatterns = [
/(?:last_?(?:updated|modified)|modified_?(?:date|time)|update_?(?:date|time)|published_?(?:date|time))\s*[=:]\s*['"]?(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d{3})?(?:Z|[+-]\d{2}:\d{2})?)['"]?/i,
/(?:last_?(?:updated|modified)|modified_?(?:date|time)|update_?(?:date|time)|published_?(?:date|time))\s*[=:]\s*new Date\(["']?([^)]+)["']?\)/i,
/(?:last_?(?:updated|modified)|modified_?(?:date|time)|update_?(?:date|time)|published_?(?:date|time))\s*[=:]\s*(\d+)/i // Unix timestamp
];
for (const pattern of timestampPatterns) {
const match = scriptContent.match(pattern);
if (match && match[1]) {
let date;
if (/^\d+$/.test(match[1])) {
// Handle Unix timestamps (in seconds or milliseconds)
const timestamp = parseInt(match[1]);
date = new Date(timestamp > 9999999999 ? timestamp : timestamp * 1000);
} else {
date = new Date(match[1]);
}
if (!isNaN(date.getTime())) {
results.push({
type: 'jsTimestamp',
date: date.toISOString(),
context: match[0],
priority: 'medium'
});
}
}
}
// Look for data objects with date properties
const dataObjectPattern = /(?:article|page|post|document|content|data)(?:Data)?\s*[=:]\s*\{[\s\S]*?(?:updated|modified|published|date)(?:At|On|Date|Time)?\s*[=:]\s*["']?([^,"'\}\s]+)["']?/i;
const dataObjectMatch = scriptContent.match(dataObjectPattern);
if (dataObjectMatch && dataObjectMatch[1]) {
const date = new Date(dataObjectMatch[1]);
if (!isNaN(date.getTime())) {
results.push({
type: 'jsDataObject',
date: date.toISOString(),
context: dataObjectMatch[0].substring(0, 100),
priority: 'medium'
});
}
}
}
// Look for JSON-LD scripts which often contain date information
const jsonLdPattern = /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
let jsonLdMatch;
while ((jsonLdMatch = jsonLdPattern.exec(html)) !== null) {
try {
const jsonLd = JSON.parse(jsonLdMatch[1]);
// Extract dates from JSON-LD
const dateFields = ['dateModified', 'dateUpdated', 'datePublished', 'uploadDate'];
for (const field of dateFields) {
if (jsonLd[field]) {
const date = new Date(jsonLd[field]);
if (!isNaN(date.getTime())) {
results.push({
type: 'jsonLd',
field: field,
date: date.toISOString(),
priority: field === 'dateModified' ? 'high' : 'medium',
context: `JSON-LD ${field}`
});
}
}
}
} catch (e) {
// Invalid JSON, skip this script
}
}
return results;
}
// Extract RSS/Atom feed timestamps
async function extractFeedTimestamps(targetUrl: string): Promise<Array<{
type: string;
date: string;
context: string;
priority: string;
}>> {
const results: Array<{
type: string;
date: string;
context: string;
priority: string;
}> = [];
try {
const urlObj = new URL(targetUrl);
const baseUrl = `${urlObj.protocol}//${urlObj.hostname}`;
// Common feed URLs to check
const feedUrls = [
`${baseUrl}/feed`,
`${baseUrl}/rss`,
`${baseUrl}/atom`,
`${baseUrl}/feed.xml`,
`${baseUrl}/rss.xml`,
`${baseUrl}/atom.xml`,
`${baseUrl}/sitemap.xml`
];
for (const feedUrl of feedUrls) {
try {
const response = await fetch(feedUrl);
if (!response.ok) continue;
const text = await response.text();
// Check for RSS feed
if (text.includes('<rss') || text.includes('<rdf')) {
// Look for lastBuildDate in RSS
const lastBuildMatch = text.match(/<lastBuildDate>([^<]+)<\/lastBuildDate>/i);
if (lastBuildMatch) {
const date = parseDate(lastBuildMatch[1]);
if (date) {
results.push({
type: 'rssLastBuild',
date: date.toISOString(),
context: `RSS feed: ${feedUrl}`,
priority: 'high'
});
}
}
// Look for pubDate in RSS items
const pubDateMatch = text.match(/<pubDate>([^<]+)<\/pubDate>/i);
if (pubDateMatch) {
const date = parseDate(pubDateMatch[1]);
if (date) {
results.push({
type: 'rssPubDate',
date: date.toISOString(),
context: `RSS pubDate: ${feedUrl}`,
priority: 'medium'
});
}
}
}
// Check for Atom feed
else if (text.includes('<feed')) {
// Look for updated in Atom
const updatedMatch = text.match(/<updated>([^<]+)<\/updated>/i);
if (updatedMatch) {
const date = parseDate(updatedMatch[1]);
if (date) {
results.push({
type: 'atomUpdated',
date: date.toISOString(),
context: `Atom feed: ${feedUrl}`,
priority: 'high'
});
}
}
}
// Check for sitemap
else if (text.includes('<urlset')) {
// Look for lastmod in sitemap
const lastmodMatch = text.match(/<lastmod>([^<]+)<\/lastmod>/i);
if (lastmodMatch) {
const date = parseDate(lastmodMatch[1]);
if (date) {
results.push({
type: 'sitemapLastmod',
date: date.toISOString(),
context: `Sitemap: ${feedUrl}`,
priority: 'high'
});
}
}
}
} catch (e) {
// Skip failed feed requests
continue;
}
}
} catch (e) {
// Skip if URL parsing fails
}
return results;
}
// Extract all time indicators from a web page
async function extractAllTimeIndicators(response: Response, html: string, targetUrl: string) {
const result: any = {
// Standard HTTP headers
lastModified: response.headers.get('last-modified') || null,
etag: response.headers.get('etag') || null,
date: response.headers.get('date') || null,
expires: response.headers.get('expires') || null,
// HTML metadata
metaTags: extractMetaUpdateTimes(html),
// HTML content-based timestamps
visibleDates: extractVisibleDates(html),
// JavaScript timestamps
jsTimestamps: extractJavaScriptTimestamps(html),
// Schema.org timestamps
schemaOrgTimestamps: extractSchemaOrgTimestamps(html),
// HTML comments extraction
htmlComments: extractHtmlComments(html),
// Git info extraction
gitInfo: extractGitInfo(html),
// Server time
serverTime: response.headers.get('date') || null,
};
// Add feed timestamps (RSS, Atom, Sitemap)
try {
result.feedTimestamps = await extractFeedTimestamps(targetUrl);
} catch (e) {
result.feedTimestamps = [];
}
return result;
}
// Determine the best update time from all available indicators
function determineBestUpdateTime(updateTimes: any) {
// First check for meta tags that explicitly indicate last modified time
if (updateTimes.metaTags && updateTimes.metaTags.lastModified) {
return {
timestamp: updateTimes.metaTags.lastModified,
confidence: 95,
reasoning: ["Explicit lastmodifiedtime meta tag"]
};
}
// Check other meta tags related to publication/modification
if (updateTimes.metaTags) {
if (updateTimes.metaTags.articleModified) {
return {
timestamp: updateTimes.metaTags.articleModified,
confidence: 90,
reasoning: ["Article modified time meta tag"]
};
}
if (updateTimes.metaTags.publishedDate) {
return {
timestamp: updateTimes.metaTags.publishedDate,
confidence: 85,
reasoning: ["Published date meta tag"]
};
}
// Check for new high-value meta tags
if (updateTimes.metaTags.ogUpdatedTime) {
return {
timestamp: updateTimes.metaTags.ogUpdatedTime,
confidence: 88,
reasoning: ["Open Graph updated time meta tag"]
};
}
if (updateTimes.metaTags.lastmod) {
return {
timestamp: updateTimes.metaTags.lastmod,
confidence: 87,
reasoning: ["Last modified meta tag"]
};
}
if (updateTimes.metaTags.generated) {
return {
timestamp: updateTimes.metaTags.generated,
confidence: 85,
reasoning: ["Page generated meta tag"]
};
}
if (updateTimes.metaTags.build) {
return {
timestamp: updateTimes.metaTags.build,
confidence: 83,
reasoning: ["Build timestamp meta tag"]
};
}
}
// Check feed timestamps (RSS, Atom, Sitemap) - often very reliable
if (updateTimes.feedTimestamps && updateTimes.feedTimestamps.length > 0) {
// Filter for high priority feed timestamps
const highPriorityFeeds = updateTimes.feedTimestamps
.filter((stamp: any) => stamp.priority === 'high')
.map((stamp: any) => ({ date: new Date(stamp.date), type: stamp.type, context: stamp.context }));
if (highPriorityFeeds.length > 0) {
// Sort by recency
highPriorityFeeds.sort((a: any, b: any) => b.date.getTime() - a.date.getTime());
return {
timestamp: highPriorityFeeds[0].date.toISOString(),
confidence: 92,
reasoning: ["Feed timestamp", `Type: ${highPriorityFeeds[0].type}`, `Context: ${highPriorityFeeds[0].context}`]
};
}
// If no high priority feeds, use most recent feed timestamp
const allFeedDates = updateTimes.feedTimestamps
.map((stamp: any) => ({ date: new Date(stamp.date), type: stamp.type, context: stamp.context }));
allFeedDates.sort((a: any, b: any) => b.date.getTime() - a.date.getTime());
return {
timestamp: allFeedDates[0].date.toISOString(),
confidence: 85,
reasoning: ["Feed timestamp", `Type: ${allFeedDates[0].type}`, `Context: ${allFeedDates[0].context}`]
};
}
// Check visible dates with high priority markers
if (updateTimes.visibleDates && updateTimes.visibleDates.length > 0) {
// Look for dates that appear to be part of lastmodified content
const contentDates = updateTimes.visibleDates.filter((d: any) => {
const ctx = d.context.toLowerCase();
return ctx.includes('lastmodified') ||
ctx.includes('last modified') ||
ctx.includes('updated') ||
ctx.includes('修改') || // Chinese for "modified"
ctx.includes('更新'); // Chinese for "updated"
});
if (contentDates.length > 0) {
// Sort by recency
const dates = contentDates.map((d: any) => new Date(d.date));
dates.sort((a: Date, b: Date) => {
if (!a || !b) return 0;
return b.getTime() - a.getTime();
});
return {
timestamp: dates[0].toISOString(),
confidence: 92,
reasoning: ["Content explicitly marked as modified/updated"]
};
}
// Next check for dates that appear in common date display elements
const displayDateElements = updateTimes.visibleDates.filter((d: any) => {
const ctx = d.context.toLowerCase();
return ctx.includes('class="date') ||
ctx.includes('class="time') ||
ctx.includes('class="pubdate') ||
ctx.includes('class="published') ||
ctx.includes('pages-date') ||
ctx.includes('pub-date');
});
if (displayDateElements.length > 0) {
const dates = displayDateElements.map((d: any) => new Date(d.date));
dates.sort((a: Date, b: Date) => b.getTime() - a.getTime());
return {
timestamp: dates[0].toISOString(),
confidence: 88,
reasoning: ["Date from primary content display element"]
};
}
}
// Check for Schema.org timestamps
if (updateTimes.schemaOrgTimestamps && updateTimes.schemaOrgTimestamps.length > 0) {
// Filter for high priority fields: dateModified and dateUpdated
const highPriorityDates = updateTimes.schemaOrgTimestamps
.filter((stamp: any) => stamp.priority === 'high')
.map((stamp: any) => ({ date: new Date(stamp.date), field: stamp.field, context: stamp.context }));
if (highPriorityDates.length > 0) {
// Sort by recency
highPriorityDates.sort((a: any, b: any) => b.date.getTime() - a.date.getTime());
return {
timestamp: highPriorityDates[0].date.toISOString(),
confidence: 85,
reasoning: ["Schema.org structured data", `Field: ${highPriorityDates[0].field}`, `Context: ${highPriorityDates[0].context}`]
};
}
// If no high priority fields, use most recent Schema.org date
const allSchemaDates = updateTimes.schemaOrgTimestamps
.map((stamp: any) => ({ date: new Date(stamp.date), field: stamp.field, context: stamp.context }));
allSchemaDates.sort((a: any, b: any) => b.date.getTime() - a.date.getTime());
return {
timestamp: allSchemaDates[0].date.toISOString(),
confidence: 75,
reasoning: ["Schema.org structured data", `Field: ${allSchemaDates[0].field}`, `Context: ${allSchemaDates[0].context}`]
};
}
// Check Git info (often very reliable)
if (updateTimes.gitInfo && updateTimes.gitInfo.gitDate) {
return {
timestamp: updateTimes.gitInfo.gitDate,
confidence: 90,
reasoning: ["Git commit information", updateTimes.gitInfo.gitHash ? `Git hash: ${updateTimes.gitInfo.gitHash}` : ""]
};
} else if (updateTimes.gitInfo && updateTimes.gitInfo.deployDate) {
return {
timestamp: updateTimes.gitInfo.deployDate,
confidence: 88,
reasoning: ["Git deployment timestamp"]
};
}
// JSON-LD structured data is also quite reliable
if (updateTimes.jsTimestamps && updateTimes.jsTimestamps.length > 0) {
const jsonLdDates = updateTimes.jsTimestamps
.filter((stamp: any) => stamp.type === 'jsonLd')
.map((stamp: any) => ({
date: new Date(stamp.date),
field: stamp.field,
priority: stamp.priority
}));
if (jsonLdDates.length > 0) {
// Sort by priority and recency
jsonLdDates.sort((a: any, b: any) => {
if (a.priority === 'high' && b.priority !== 'high') return -1;
if (a.priority !== 'high' && b.priority === 'high') return 1;
return b.date.getTime() - a.date.getTime();
});
return {
timestamp: jsonLdDates[0].date.toISOString(),
confidence: jsonLdDates[0].priority === 'high' ? 80 : 65,
reasoning: [`JSON-LD structured data (${jsonLdDates[0].field})`]
};
}
}
// If we have a page generation time meta tag, it's a decent indicator
if (updateTimes.metaTags && updateTimes.metaTags.pageGenerated) {
return {
timestamp: updateTimes.metaTags.pageGenerated,
confidence: 75,
reasoning: ["Page generation time meta tag"]
};
}
// Process visible dates that don't have explicit modification indicators
if (updateTimes.visibleDates && updateTimes.visibleDates.length > 0) {
// Get all dates and sort by recency
const allDates = updateTimes.visibleDates.map((d: any) => ({
date: new Date(d.date),
context: d.context
}));
allDates.sort((a: any, b: any) => b.date.getTime() - a.date.getTime());
return {
timestamp: allDates[0].date.toISOString(),
confidence: 70,
reasoning: ["Most recent date found in page content", `Context: "${allDates[0].context}"`]
};
}
// Try HTML comments
if (updateTimes.htmlComments && updateTimes.htmlComments.length > 0) {
const commentDates = updateTimes.htmlComments.map((c: any) => ({
date: new Date(c.date),
context: c.context
}));
commentDates.sort((a: any, b: any) => b.date.getTime() - a.date.getTime());
return {
timestamp: commentDates[0].date.toISOString(),
confidence: 60,
reasoning: ["Timestamp from HTML comment", `Context: "${commentDates[0].context}"`]
};
}
// Try JavaScript timestamps
if (updateTimes.jsTimestamps && updateTimes.jsTimestamps.length > 0) {
const jsDates = updateTimes.jsTimestamps
.filter((stamp: any) => stamp.type !== 'jsonLd')
.map((stamp: any) => ({
date: new Date(stamp.date),
context: stamp.context,
type: stamp.type
}));
if (jsDates.length > 0) {
// Sort by recency
jsDates.sort((a: any, b: any) => b.date.getTime() - a.date.getTime());
return {
timestamp: jsDates[0].date.toISOString(),
confidence: 60,
reasoning: ["JavaScript timestamp found", `Context: "${jsDates[0].context}"`]
};
}
}
// Use HTTP Last-Modified even if it matches server time, but with lower confidence
if (updateTimes.lastModified) {
const lastModDate = new Date(updateTimes.lastModified);
if (!isNaN(lastModDate.getTime())) {
// Check if Last-Modified differs significantly from server time
if (updateTimes.date) {
const serverDate = new Date(updateTimes.date);
const timeDiff = Math.abs(lastModDate.getTime() - serverDate.getTime());
if (timeDiff > 60000) { // More than 1 minute difference
return {
timestamp: lastModDate.toISOString(),
confidence: 75,
reasoning: ["HTTP Last-Modified header differs significantly from server time", `Difference: ${Math.round(timeDiff / 1000 / 60)} minutes`]
};
} else if (timeDiff > 1000) { // More than 1 second difference
return {
timestamp: lastModDate.toISOString(),
confidence: 65,
reasoning: ["HTTP Last-Modified header differs from server time", `Difference: ${Math.round(timeDiff / 1000)} seconds`]
};
} else {
return {
timestamp: lastModDate.toISOString(),
confidence: 40,
reasoning: ["HTTP Last-Modified header (may be server time)"]
};
}
} else {
return {
timestamp: lastModDate.toISOString(),
confidence: 60,
reasoning: ["HTTP Last-Modified header"]
};
}
}
}
// If all else fails, use the server date with very low confidence
if (updateTimes.date) {
return {
timestamp: new Date(updateTimes.date).toISOString(),
confidence: 10,
reasoning: ["No update time found", "Using server date as fallback"]
};
}
// Absolute fallback: unknown
return {
timestamp: null,
confidence: 0,
reasoning: ["No reliable update time indicators found"]
};
}
// Main function to guess datetime from a URL
export async function guessDatetimeFromUrl(url: string): Promise<{
bestGuess: string | null;
confidence: number;
}> {
try {
// Fetch the target webpage
const response = await fetch(url);
if (!response.ok) {
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
}
const text = await response.text();
// Extract all possible time indicators
const updateTimes = await extractAllTimeIndicators(response, text, url);
// Advanced heuristic-based determination of the "true" update time
const bestGuess = determineBestUpdateTime(updateTimes);
// Result with confidence score
const result = {
bestGuess: bestGuess.timestamp,
confidence: bestGuess.confidence
};
return result;
} catch (error) {
throw new Error(`Failed to guess datetime from URL: ${error instanceof Error ? error.message : String(error)}`);
}
}