/**
* HTML parsing utilities for Gemini API documentation.
*/
import * as cheerio from "cheerio";
import { request } from "undici";
import { USER_AGENT } from "../constants.js";
/**
 * Structured content extracted from a single documentation page.
 */
export interface DocContent {
  /** URL associated with the page. */
  url: string;
  /** Page title. */
  title: string;
  /** Short introductory text; may be an empty string. */
  description: string;
  /** Headed sections of the page. */
  sections: {
    /** Numeric heading level (for example 2 for an `h2`). */
    level: number;
    /** Text of the heading. */
    title: string;
    /** Text belonging to the section. */
    content: string;
  }[];
  /** Code samples found on the page. */
  codeExamples: {
    /** Language label of the sample. */
    language: string;
    /** Source text of the sample. */
    code: string;
  }[];
  /** Flattened text of the page. */
  fullText: string;
}
/**
 * Downloads a page with a single GET request and returns its body as text.
 *
 * The request is sent with fixed browser-style `User-Agent`, `Accept` and
 * `Accept-Language` headers.
 *
 * @param url - Address of the page to retrieve.
 * @returns The response body decoded as text.
 * @throws Error when the response status is anything other than 200. The
 *   message contains the URL, the status code and the first 200 characters
 *   of the response body.
 */
export async function fetchHtml(url: string): Promise<string> {
  const headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
  };
  const response = await request(url, { method: "GET", headers });

  // The body is read on both paths: it is the result on success and the
  // source of the error excerpt on failure.
  const payload = await response.body.text();
  if (response.statusCode === 200) {
    return payload;
  }
  throw new Error(`Failed to fetch ${url}: ${response.statusCode} - ${payload.substring(0, 200)}`);
}
/**
 * Converts the HTML of a Gemini API documentation page into a DocContent.
 *
 * @param html - Raw HTML markup of the page.
 * @param url - URL to store on the result; it is copied through unchanged.
 * @returns The extracted title, description, headed sections, code examples
 *   and flattened text. At most 10 code examples and 30000 characters of
 *   flattened text are returned.
 */
export function parseDocPage(html: string, url: string): DocContent {
  const headingSelector = "h1, h2, h3, h4";
  const $ = cheerio.load(html);

  // Title: text of the first <h1>, or a placeholder when that is blank.
  const firstHeadingText = $("h1").first().text().trim();
  const title = firstHeadingText || "Untitled";

  // Description: first paragraph inside <article>, otherwise the first
  // paragraph inside the devsite article body, otherwise an empty string.
  let description = $("article p").first().text().trim();
  if (!description) {
    description = $(".devsite-article-body p").first().text().trim();
  }

  // Sections: one entry per non-blank h1-h4 found inside the article root.
  const article = $("article, .devsite-article-body").first();
  const sections: DocContent["sections"] = [];
  if (article.length > 0) {
    article.find(headingSelector).each((_index, heading) => {
      const headingNode = $(heading);
      const sectionTitle = headingNode.text().trim();
      if (!sectionTitle) {
        // Headings without text produce no section.
        return;
      }
      const level = parseInt(heading.tagName.replace("h", ""), 10);

      // Collect the text of following siblings up to the next heading,
      // leaving out siblings that are themselves code elements.
      const pieces: string[] = [];
      for (
        let sibling = headingNode.next();
        sibling.length > 0 && !sibling.is(headingSelector);
        sibling = sibling.next()
      ) {
        if (sibling.is("pre, code, .devsite-code-block")) {
          continue;
        }
        pieces.push(sibling.text().trim() + "\n");
      }

      sections.push({
        level,
        title: sectionTitle,
        content: pieces.join("").trim(),
      });
    });
  }

  // Code examples: every non-blank <code> inside <pre> or a devsite code block.
  const codeExamples: DocContent["codeExamples"] = [];
  $("pre code, .devsite-code-block code").each((_index, codeEl) => {
    const codeNode = $(codeEl);
    const code = codeNode.text().trim();
    if (!code) {
      return;
    }
    // The language comes from a "language-xxx" or "xxx-code" class name and
    // falls back to "text" when neither pattern is present.
    const classAttr = codeNode.attr("class") || "";
    const langMatch = classAttr.match(/language-(\w+)|(\w+)-code/);
    let language = "text";
    if (langMatch) {
      language = langMatch[1] || langMatch[2];
    }
    codeExamples.push({ language, code });
  });

  // Full text: the article text with code, script, style and nav elements
  // removed from a clone, and all whitespace runs collapsed to one space.
  const textOnly = article.clone();
  textOnly.find("pre, code, script, style, nav").remove();
  const fullText = textOnly.text().replace(/\s+/g, " ").trim();

  return {
    url,
    title,
    description,
    sections,
    codeExamples: codeExamples.slice(0, 10), // keep at most 10 examples
    fullText: fullText.substring(0, 30000), // keep at most 30000 characters
  };
}
/**
 * Chooses a backtick fence that the wrapped code cannot close early.
 *
 * The fence is one backtick longer than the longest backtick run in `code`,
 * and never shorter than three backticks.
 */
function markdownFence(code: string): string {
  const backtickRuns = code.match(/`+/g);
  const longestRun = backtickRuns
    ? Math.max(...backtickRuns.map((run) => run.length))
    : 0;
  return "`".repeat(Math.max(3, longestRun + 1));
}

/**
 * Formats DocContent as Markdown.
 *
 * The output contains, in order: the title as a level-1 heading, the URL,
 * the description (when non-empty), a table of contents, the sections, and
 * up to 5 code examples each limited to 2000 characters.
 *
 * @param doc - The document to render.
 * @returns The Markdown text, with lines joined by "\n".
 */
export function formatAsMarkdown(doc: DocContent): string {
  const lines: string[] = [];
  lines.push(`# ${doc.title}`);
  lines.push("");
  lines.push(`**URL**: ${doc.url}`);
  lines.push("");
  if (doc.description) {
    lines.push(doc.description);
    lines.push("");
  }

  lines.push("## Table of Contents");
  lines.push("");
  for (const section of doc.sections) {
    // Levels below 1 are treated as level 1 so that repeat() never receives
    // a negative count (which would throw a RangeError).
    const level = Math.max(section.level, 1);
    const indent = " ".repeat(level - 1);
    lines.push(`${indent}- ${section.title}`);
  }
  lines.push("");

  lines.push("## Content");
  lines.push("");
  for (const section of doc.sections) {
    const level = Math.max(section.level, 1);
    // Section headings sit one level below the page title and are capped at
    // the deepest Markdown heading level (6).
    const headingPrefix = "#".repeat(Math.min(level + 1, 6));
    lines.push(`${headingPrefix} ${section.title}`);
    lines.push("");
    if (section.content) {
      lines.push(section.content);
      lines.push("");
    }
  }

  if (doc.codeExamples.length > 0) {
    lines.push("## Code Examples");
    lines.push("");
    for (const example of doc.codeExamples.slice(0, 5)) {
      const snippet = example.code.substring(0, 2000);
      // A snippet that contains ``` would terminate a fixed three-backtick
      // fence, so the fence length is derived from the snippet itself.
      const fence = markdownFence(snippet);
      lines.push(`${fence}${example.language}`);
      lines.push(snippet);
      lines.push(fence);
      lines.push("");
    }
  }
  return lines.join("\n");
}
/**
 * Cleans a string of control characters and normalizes its whitespace.
 *
 * Tab, line feed and carriage return survive the first step; every other
 * character in the ranges U+0000-U+001F and U+007F is deleted. Tabs are then
 * turned into a space and both "\r\n" and a lone "\r" become "\n".
 *
 * @param str - Text to clean.
 * @returns The cleaned text.
 */
function sanitizeString(str: string): string {
  // Applied in order; CRLF must be handled before the lone-CR rule.
  const replacements: Array<[RegExp, string]> = [
    [/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, ""],
    [/\t/g, " "],
    [/\r\n/g, "\n"],
    [/\r/g, "\n"],
  ];
  return replacements.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    str,
  );
}
/**
 * Returns a copy of a value in which every string has been passed through
 * sanitizeString.
 *
 * Strings are sanitized directly, arrays are mapped element by element, and
 * other non-null objects are rebuilt as plain objects from their own
 * enumerable string-keyed entries with each value sanitized recursively.
 * Every other value (numbers, booleans, null, undefined, ...) is returned
 * unchanged.
 *
 * @param obj - Value to sanitize.
 * @returns The sanitized copy, or the value itself when nothing applies.
 */
function sanitizeObject<T>(obj: T): T {
  if (typeof obj === "string") {
    return sanitizeString(obj) as T;
  }
  if (obj === null || typeof obj !== "object") {
    // Primitives other than strings have nothing to sanitize.
    return obj;
  }
  if (Array.isArray(obj)) {
    return obj.map((item) => sanitizeObject(item)) as T;
  }

  const copy: Record<string, unknown> = {};
  Object.entries(obj).forEach(([key, value]) => {
    copy[key] = sanitizeObject(value);
  });
  return copy as T;
}
/**
 * Produces a size-limited copy of a DocContent.
 *
 * The input is not modified. Section content is cut to 2000 characters, only
 * the first 5 code examples are kept and each is cut to 1500 characters.
 *
 * @param doc - The document to shrink.
 * @param maxFullTextLength - Maximum length of `fullText` (default 20000).
 * @returns A new DocContent with the limits applied.
 */
function truncateDocContent(doc: DocContent, maxFullTextLength: number = 20000): DocContent {
  const clip = (text: string, limit: number): string => text.substring(0, limit);

  const fullText = clip(doc.fullText, maxFullTextLength);
  const sections = doc.sections.map((section) => {
    return { ...section, content: clip(section.content, 2000) };
  });
  const codeExamples = doc.codeExamples.slice(0, 5).map((example) => {
    return { ...example, code: clip(example.code, 1500) };
  });

  return { ...doc, fullText, sections, codeExamples };
}
/**
 * Serializes a DocContent as pretty-printed JSON.
 *
 * The document is first shrunk with truncateDocContent (default limits) and
 * then cleaned with sanitizeObject before being serialized.
 *
 * @param doc - The document to serialize.
 * @returns JSON text indented with 2 spaces.
 */
export function formatAsJson(doc: DocContent): string {
  return JSON.stringify(sanitizeObject(truncateDocContent(doc)), null, 2);
}