import playwright from "@playwright/test";
import { ServiceMetadata } from "@superglue/shared";
import axios from "axios";
import { afterEach, beforeEach, describe, expect, it, Mocked, vi } from "vitest";
import { server_defaults } from "../default.js";
import { DocumentationFetcher } from "./documentation-fetching.js";
import { PlaywrightFetchingStrategy } from "./strategies/index.js";
import { DocumentationSearch } from "./documentation-search.js";
// Mock playwright and axios
vi.mock("@playwright/test", async (importOriginal) => {
  // Keep every real export of the module and override only the default
  // export, whose chromium.launch becomes a spy the tests can script.
  const actualModule = (await importOriginal()) as any;
  const chromium = { launch: vi.fn() };
  return { ...actualModule, default: { chromium } };
});
// axios is mocked without a factory; individual tests script get/post.
vi.mock("axios");
// Helper to create standard Playwright mocks
const createPlaywrightMocks = () => {
  // Shorthand: a spy whose returned promise resolves to `value`.
  const resolves = (value?: unknown) => vi.fn().mockResolvedValue(value);

  // Build the chain bottom-up: browser -> context -> page.
  const page = {
    goto: resolves(),
    waitForLoadState: resolves(),
    waitForTimeout: resolves(),
    addInitScript: resolves(),
    content: resolves(""),
    evaluate: resolves(),
    close: resolves(),
  };
  const context = { newPage: resolves(page), close: resolves() };
  const browser = { newContext: resolves(context), close: resolves() };

  // Make the mocked chromium.launch hand out the fake browser; the double
  // assertion is needed because the fake only implements a subset of Browser.
  const launchSpy = vi.mocked(playwright.chromium.launch);
  launchSpy.mockResolvedValue(browser as unknown as playwright.Browser);

  return { mockPage: page, mockContext: context, mockBrowser: browser };
};
describe("Documentation Class", () => {
// Playwright doubles; reassigned from createPlaywrightMocks() in beforeEach.
let mockPage: any;
let mockContext: any;
let mockBrowser: any;
// Typed handle on the mocked axios module, reassigned in beforeEach.
let mockedAxios: Mocked<typeof axios>; // Use Mocked type
// Passed as the third constructor argument to every DocumentationFetcher below.
let metadata: ServiceMetadata = { orgId: "" };
beforeEach(() => {
  // Clear recorded calls on every mock, then fully reset the axios verbs so
  // no scripted response from a previous test carries over.
  vi.clearAllMocks();
  mockedAxios = axios as Mocked<typeof axios>;
  mockedAxios.get.mockReset();
  mockedAxios.post.mockReset();
  // LLM_PROVIDER must be set to prevent errors when accessing
  // LanguageModel.contextLength. It is stubbed (rather than assigned on
  // process.env directly) so afterEach can restore the previous value and the
  // override does not leak out of this suite.
  vi.stubEnv("LLM_PROVIDER", "ANTHROPIC");
  // Create standard mocks for Playwright
  ({ mockPage, mockContext, mockBrowser } = createPlaywrightMocks());
});
afterEach(async () => {
  try {
    // Use the static closeBrowser from the strategy class
    await PlaywrightFetchingStrategy.closeBrowser();
  } finally {
    // Restore the environment even if closing the browser throws.
    vi.unstubAllEnvs();
  }
});
describe("fetchAndProcess", () => {
it("should fetch and convert HTML documentation via Playwright", async () => {
  const docUrl = "https://api.example.com/docs";
  const playwrightTimeout = { timeout: server_defaults.DOCUMENTATION.TIMEOUTS.PLAYWRIGHT };
  const htmlDoc = `
<html><body><h1>API Docs</h1><p>Details here.</p></body></html>
`;
  // Arrange: the page evaluates to the HTML document and every axios GET
  // (the sitemap lookups) rejects.
  mockPage.evaluate.mockResolvedValue({
    html: htmlDoc,
    textContent: "API Docs Details here.",
    links: {},
  });
  mockedAxios.get.mockRejectedValue(new Error("404"));

  // Act
  const fetcher = new DocumentationFetcher(
    { documentationUrl: docUrl, urlHost: "https://api.example.com" },
    {},
    metadata,
  );
  const markdown = await fetcher.fetchAndProcess();

  // Assert: one browser/context/page was created and driven as expected.
  expect(playwright.chromium.launch).toHaveBeenCalledTimes(1);
  expect(mockBrowser.newContext).toHaveBeenCalledTimes(1);
  expect(mockContext.newPage).toHaveBeenCalledTimes(1);
  expect(mockPage.goto).toHaveBeenCalledWith(docUrl, playwrightTimeout);
  expect(mockPage.waitForLoadState).toHaveBeenCalledWith("domcontentloaded", playwrightTimeout);
  expect(mockPage.waitForTimeout).toHaveBeenCalledWith(1000);
  // A single evaluate covers DOM manipulation and link extraction.
  expect(mockPage.evaluate).toHaveBeenCalledTimes(1);
  // The HTML comes back as Markdown.
  expect(markdown).toContain("# API Docs");
  expect(markdown).toContain("Details here.");
  // GET was attempted (sitemaps) but no POST was issued.
  expect(mockedAxios.get).toHaveBeenCalled();
  expect(mockedAxios.post).not.toHaveBeenCalled();
});
it("should return raw page content if not HTML, GraphQL, or OpenAPI", async () => {
  const rawText = "Plain text documentation content.";
  // The page yields the same plain text for html and textContent; every
  // axios GET (sitemap lookups) rejects.
  mockPage.evaluate.mockResolvedValue({ html: rawText, textContent: rawText, links: {} });
  mockedAxios.get.mockRejectedValue(new Error("404"));

  const fetcher = new DocumentationFetcher(
    { documentationUrl: "https://api.example.com/raw", urlHost: "https://api.example.com" },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  expect(playwright.chromium.launch).toHaveBeenCalledTimes(1);
  expect(mockPage.evaluate).toHaveBeenCalledTimes(1);
  // The text is returned untouched.
  expect(output).toBe(rawText);
  expect(mockedAxios.get).toHaveBeenCalled();
  expect(mockedAxios.post).not.toHaveBeenCalled();
});
it("should attempt GraphQL introspection for likely GraphQL URLs", async () => {
  const endpoint = "https://api.example.com/graphql";
  const headers = { Auth: "key" };
  const queryParams = { p: "1" };
  const schemaPayload = { __schema: { types: [{ name: "Query" }] } };
  // The first POST resolves with a successful introspection response.
  mockedAxios.post.mockResolvedValueOnce({ data: { data: schemaPayload } });

  const fetcher = new DocumentationFetcher(
    {
      documentationUrl: endpoint,
      urlHost: "https://api.example.com",
      urlPath: "/graphql",
      headers,
      queryParams,
    },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  // Headers and query params are forwarded with the axios timeout.
  expect(mockedAxios.post).toHaveBeenCalledWith(
    endpoint,
    expect.objectContaining({ operationName: "IntrospectionQuery" }),
    { headers, params: queryParams, timeout: server_defaults.DOCUMENTATION.TIMEOUTS.AXIOS },
  );
  // The serialized __schema is the result and no browser was launched.
  expect(output).toBe(JSON.stringify(schemaPayload.__schema));
  expect(playwright.chromium.launch).not.toHaveBeenCalled();
});
it("should fall back to Playwright fetch if GraphQL introspection fails", async () => {
  // URL that looks like a GraphQL endpoint.
  const graphqlLikeUrl = "https://api.example.com/graphql";
  const pageHtml = "<html><body>GraphQL Maybe?</body></html>";
  // The introspection POST rejects, simulating a network failure.
  mockedAxios.post.mockRejectedValueOnce(new Error("GraphQL Network Error"));
  mockPage.evaluate.mockResolvedValue({
    html: pageHtml,
    textContent: "GraphQL Maybe?",
    links: {},
  });

  const fetcher = new DocumentationFetcher(
    { documentationUrl: graphqlLikeUrl, urlHost: "https://api.example.com" },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  // Introspection was attempted first ...
  expect(mockedAxios.post).toHaveBeenCalledWith(
    graphqlLikeUrl,
    expect.anything(),
    expect.anything(),
  );
  // ... then Playwright served the page content.
  expect(playwright.chromium.launch).toHaveBeenCalledTimes(1);
  expect(mockPage.evaluate).toHaveBeenCalledTimes(1);
  expect(output).toContain("GraphQL Maybe?");
});
it("should fall back to Playwright fetch if GraphQL returns errors", async () => {
  // URL that looks like a GraphQL endpoint.
  const graphqlLikeUrl = "https://api.example.com/graphql";
  const pageHtml = "<html><body>GraphQL Maybe?</body></html>";
  // The POST resolves, but with a GraphQL-level `errors` array.
  const errorResponse = { data: { errors: [{ message: "Bad Query" }] } };
  mockedAxios.post.mockResolvedValueOnce(errorResponse);
  mockPage.evaluate.mockResolvedValue({
    html: pageHtml,
    textContent: "GraphQL Maybe?",
    links: {},
  });

  const fetcher = new DocumentationFetcher(
    { documentationUrl: graphqlLikeUrl, urlHost: "https://api.example.com" },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  // Introspection was attempted first ...
  expect(mockedAxios.post).toHaveBeenCalledWith(
    graphqlLikeUrl,
    expect.anything(),
    expect.anything(),
  );
  // ... then Playwright served the page content.
  expect(playwright.chromium.launch).toHaveBeenCalledTimes(1);
  expect(mockPage.evaluate).toHaveBeenCalledTimes(1);
  expect(output).toContain("GraphQL Maybe?");
});
it("should extract and fetch relative OpenAPI URL found in HTML", async () => {
  const docsUrl = "https://base.example.com/docs";
  const spec = { openapi: "3.0.1", info: { title: "My API" } };
  // Every axios GET resolves with the spec object itself, so the spec is
  // obtained through axios rather than through a rendered page.
  mockedAxios.get.mockResolvedValue({ data: spec });

  const fetcher = new DocumentationFetcher(
    { documentationUrl: docsUrl, urlHost: "https://api.example.com" },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  // The spec appears in the result as indented JSON.
  expect(output).toContain('"openapi": "3.0.1"');
  expect(output).toContain('"title": "My API"');
});
it("should handle page content being the OpenAPI spec directly (JSON)", async () => {
  const specUrl = "https://api.example.com/openapi.json";
  const specText = JSON.stringify({ swagger: "2.0", info: { title: "Direct JSON" } });
  // The page content is the serialized spec; every axios GET rejects.
  mockPage.evaluate.mockResolvedValue({ html: specText, textContent: specText, links: {} });
  mockedAxios.get.mockRejectedValue(new Error("404"));

  const fetcher = new DocumentationFetcher(
    { documentationUrl: specUrl, urlHost: "https://api.example.com" },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  expect(playwright.chromium.launch).toHaveBeenCalledTimes(1);
  expect(mockPage.evaluate).toHaveBeenCalledTimes(1);
  expect(output).toContain(specText);
});
it("should handle page content being the OpenAPI spec directly (YAML)", async () => {
  const specUrl = "https://api.example.com/openapi.yaml";
  const yamlText = `openapi: 3.1.0\ninfo:\n title: Direct YAML`;
  // The page content is the YAML text; every axios GET rejects.
  mockPage.evaluate.mockResolvedValue({ html: yamlText, textContent: yamlText, links: {} });
  mockedAxios.get.mockRejectedValue(new Error("404"));

  const fetcher = new DocumentationFetcher(
    { documentationUrl: specUrl, urlHost: "https://api.example.com" },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  expect(playwright.chromium.launch).toHaveBeenCalledTimes(1);
  expect(mockPage.evaluate).toHaveBeenCalledTimes(1);
  // The YAML is returned verbatim.
  expect(output).toBe(yamlText);
});
it("should fall back to HTML->Markdown if OpenAPI extraction/fetch fails", async () => {
  const docUrl = "https://api.example.com/docs";
  const headers = { Auth: "key" };
  // The page embeds a swagger settings script pointing at /missing.json.
  const pageHtml = `<html><script id="swagger-settings">{ "url": "/missing.json" }</script><body>Content</body></html>`;
  mockPage.evaluate.mockResolvedValue({ html: pageHtml, textContent: "Content", links: {} });
  // Every axios GET rejects.
  mockedAxios.get.mockRejectedValue(new Error("404 Not Found"));

  const fetcher = new DocumentationFetcher(
    { documentationUrl: docUrl, urlHost: "https://api.example.com", headers },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  expect(playwright.chromium.launch).toHaveBeenCalledTimes(1);
  // The body text survives; the script's URL does not appear in the output.
  expect(output).toContain("Content");
  expect(output).not.toContain("missing.json");
});
it("should handle Playwright fetch errors gracefully", async () => {
  // The next browser launch rejects.
  const launchError = new Error("Browser launch failed");
  vi.mocked(playwright.chromium.launch).mockRejectedValueOnce(launchError);

  const fetcher = new DocumentationFetcher(
    { documentationUrl: "https://api.example.com/docs", urlHost: "https://api.example.com" },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  // Complete failure yields an empty string, and axios was still tried.
  expect(output).toBe("");
  expect(mockedAxios.get).toHaveBeenCalled();
});
it("should cache the result and return processed result on subsequent calls", async () => {
  const textBody = "Plain text data";
  // Every axios GET resolves with the plain text.
  mockedAxios.get.mockResolvedValue({ data: textBody });
  const fetcher = new DocumentationFetcher(
    { documentationUrl: "http://example.com/docs.txt" },
    {},
    metadata,
  );

  const firstResult = await fetcher.fetchAndProcess();
  expect(firstResult).toBe(textBody);

  // Snapshot the GET count so the second call can be shown to add none.
  const getCallsAfterFirstFetch = mockedAxios.get.mock.calls.length;
  const secondResult = await fetcher.fetchAndProcess();
  expect(secondResult).toBe(textBody);
  expect(mockedAxios.get.mock.calls.length).toBe(getCallsAfterFirstFetch);
});
});
describe("extractRelevantSections", () => {
// Single DocumentationSearch instance (orgId "test") shared by every test in this suite.
const documentationSearch = new DocumentationSearch({ orgId: "test" });
it("should return empty string for empty documentation", () => {
  // Empty documentation in, empty string out.
  const extracted = documentationSearch.extractRelevantSections("", "some instruction");
  expect(extracted).toBe("");
});
it("should return whole doc if no valid search terms but doc is small", () => {
  const smallDoc = "Some documentation content here";
  // Every term in "a b c" is too short to be a search term.
  const extracted = documentationSearch.extractRelevantSections(smallDoc, "a b c");
  // The doc is smaller than one section, so it comes back whole.
  expect(extracted).toBe(smallDoc);
});
it("should return whole doc if smaller than section size", () => {
  const shortDoc = "Short documentation";
  const maxSections = 5;
  const sectionSize = 500;
  const extracted = documentationSearch.extractRelevantSections(
    shortDoc,
    "documentation",
    maxSections,
    sectionSize,
  );
  expect(extracted).toBe(shortDoc);
});
it("should return empty string if no sections match search terms", () => {
  // 1000 identical characters: none of the search terms can occur.
  const uniformDoc = "A".repeat(1000);
  const extracted = documentationSearch.extractRelevantSections(
    uniformDoc,
    "nonexistent term",
    5,
    200,
  );
  expect(extracted).toBe("");
});
it("should extract sections matching search terms", () => {
  const maxSections = 3;
  const sectionSize = 200;
  // The only relevant text sits between long runs of filler.
  const doc = ["prefix ".repeat(50), "important api endpoint here ", "suffix ".repeat(50)].join("");
  const extracted = documentationSearch.extractRelevantSections(
    doc,
    "api endpoint",
    maxSections,
    sectionSize,
  );
  expect(extracted).toContain("api");
  expect(extracted).toContain("endpoint");
  expect(extracted.length).toBeLessThanOrEqual(maxSections * sectionSize);
});
it("should respect maxSections parameter", () => {
  const maxSections = 2;
  const sectionSize = 200;
  // Three keyword-bearing chunks, one more than maxSections allows.
  const chunks = [
    "first section with keyword api " + "x".repeat(170),
    "second section with keyword api " + "y".repeat(170),
    "third section with keyword api " + "z".repeat(170),
  ];
  const extracted = documentationSearch.extractRelevantSections(
    chunks.join(""),
    "api",
    maxSections,
    sectionSize,
  );
  const returnedSections = extracted.split("\n\n");
  expect(returnedSections.length).toBeLessThanOrEqual(maxSections);
  expect(extracted.length).toBeLessThanOrEqual(maxSections * sectionSize);
});
it("should respect sectionSize parameter", () => {
  const maxSections = 3;
  const sectionSize = 250;
  // Roughly 900 characters of repeated text.
  const doc = "test api ".repeat(100);
  const extracted = documentationSearch.extractRelevantSections(
    doc,
    "api test",
    maxSections,
    sectionSize,
  );
  // The output cannot exceed maxSections sections of sectionSize characters.
  expect(extracted.length).toBeLessThanOrEqual(maxSections * sectionSize);
  expect(extracted).toContain("api");
  expect(extracted).toContain("test");
});
it("should handle multiple search terms and score accordingly", () => {
  // Chunk with both terms, chunk with neither, chunk with one term.
  const bothTerms = "authentication and authorization required " + "x".repeat(160);
  const noTerms = "just some random content here " + "y".repeat(170);
  const oneTerm = "authentication mentioned once " + "z".repeat(170);
  const doc = [bothTerms, noTerms, oneTerm].join("");

  const extracted = documentationSearch.extractRelevantSections(
    doc,
    "authentication authorization",
    2,
    200,
  );

  // With room for two sections, the matching chunks are kept and the
  // chunk without any term is left out.
  expect(extracted).toContain("authentication");
  expect(extracted).toContain("authorization");
  expect(extracted).not.toContain("random content");
});
it("should maintain section order after scoring", () => {
  // Three 200-character chunks; only the first and third contain "keyword".
  const section1 = "first match for keyword " + "a".repeat(176);
  const section2 = "no matches here at all " + "b".repeat(177);
  const section3 = "third match for keyword " + "c".repeat(176);
  const doc = section1 + section2 + section3;
  const result = documentationSearch.extractRelevantSections(doc, "keyword", 2, 200);
  const firstIndex = result.indexOf("first");
  const thirdIndex = result.indexOf("third");
  // Both matching sections must actually be present. Without these checks a
  // missing "first" section (indexOf === -1) would satisfy the ordering
  // assertion below and let the test pass vacuously.
  expect(firstIndex).toBeGreaterThanOrEqual(0);
  expect(thirdIndex).toBeGreaterThanOrEqual(0);
  // Both matching sections should be included in their original order
  expect(firstIndex).toBeLessThan(thirdIndex);
});
it("should validate and adjust input parameters", () => {
  const doc = "test content ".repeat(100);
  // [maxSections, sectionSize] pairs that are out of range, checked in order:
  // maxSections too high, sectionSize too small, zero / negative values.
  const outOfRangeArgs: Array<[number, number]> = [
    [150, 200],
    [5, 50],
    [0, -100],
  ];
  for (const [maxSections, sectionSize] of outOfRangeArgs) {
    const extracted = documentationSearch.extractRelevantSections(
      doc,
      "test",
      maxSections,
      sectionSize,
    );
    expect(extracted).toContain("test");
  }
});
it("should filter search terms by minimum length", () => {
  const doc = "authentication system for api access";
  const search = (query: string) => documentationSearch.extractRelevantSections(doc, query, 1, 200);

  // "for" is too short to count as a term; "api" still matches.
  expect(search("for api")).toContain("api");

  // When every term is too short, the small doc is returned whole.
  expect(search("a or by")).toBe(doc);
});
describe("OpenAPI schema integration", () => {
it("should extract security information when security keywords are present", () => {
  // Spec declaring two security schemes and a global bearer requirement.
  const spec = {
    openapi: "3.0.0",
    info: { title: "Test API", version: "1.0.0" },
    components: {
      securitySchemes: {
        bearerAuth: { type: "http", scheme: "bearer", bearerFormat: "JWT" },
        apiKey: { type: "apiKey", in: "header", name: "X-API-Key" },
      },
    },
    security: [{ bearerAuth: [] }],
    paths: {
      "/users": {
        get: {
          summary: "Get users",
          operationId: "getUsers",
          responses: { "200": { description: "Success" } },
        },
      },
    },
  };
  const doc = "General documentation content about the API usage.";

  const extracted = documentationSearch.extractRelevantSections(
    doc,
    "authentication bearer token",
    5,
    2000,
    JSON.stringify(spec),
  );

  // The security header and the details of both schemes must be present.
  const expectedFragments = [
    "=== SECURITY ===",
    "bearerAuth",
    "bearer",
    "JWT",
    "apiKey",
    "X-API-Key",
  ];
  for (const fragment of expectedFragments) {
    expect(extracted).toContain(fragment);
  }
});
it("should not extract security info when no security keywords in query", () => {
  const spec = {
    openapi: "3.0.0",
    info: { title: "Test API", version: "1.0.0" },
    components: {
      securitySchemes: {
        bearerAuth: { type: "http", scheme: "bearer" },
      },
    },
    paths: {
      "/users": {
        get: {
          summary: "Get users",
          operationId: "getUsers",
          tags: ["users"],
          responses: { "200": { description: "Success" } },
        },
      },
    },
  };

  // The query carries no security-related words.
  const extracted = documentationSearch.extractRelevantSections(
    "General documentation content.",
    "users list",
    5,
    2000,
    JSON.stringify(spec),
  );

  expect(extracted).not.toContain("=== SECURITY ===");
  expect(extracted).not.toContain("bearerAuth");
});
it("should extract and rank relevant OpenAPI operations based on search terms", () => {
  const ok = { "200": { description: "Success" } };
  // Three user-related operations plus one unrelated products operation.
  const spec = {
    openapi: "3.0.0",
    info: { title: "Test API", version: "1.0.0" },
    paths: {
      "/users": {
        get: {
          summary: "List all users",
          operationId: "listUsers",
          tags: ["users"],
          description: "Get a list of all users in the system",
          responses: ok,
        },
        post: {
          summary: "Create a user",
          operationId: "createUser",
          tags: ["users"],
          description: "Create a new user account",
          responses: { "201": { description: "Created" } },
        },
      },
      "/products": {
        get: {
          summary: "List products",
          operationId: "listProducts",
          tags: ["products"],
          description: "Get all products from catalog",
          responses: ok,
        },
      },
      "/users/{id}": {
        get: {
          summary: "Get user by ID",
          operationId: "getUserById",
          tags: ["users"],
          description: "Fetch a single user by their unique identifier",
          parameters: [{ name: "id", in: "path", required: true, schema: { type: "string" } }],
          responses: ok,
        },
      },
    },
  };

  const extracted = documentationSearch.extractRelevantSections(
    "Some general documentation text.",
    "users account identifier",
    5,
    2000,
    JSON.stringify(spec),
  );

  expect(extracted).toContain("=== OPENAPI OPERATIONS ===");
  expect(extracted).toContain("[GET /users]");
  expect(extracted).toContain("listUsers");
  // The search terms only match user operations, so products is absent.
  expect(extracted).not.toContain("products");
  expect(extracted).not.toContain("listProducts");
});
it("should match operations by path, method, operationId, and description", () => {
  const ok = { "200": { description: "Success" } };
  const openApiSpec = JSON.stringify({
    openapi: "3.0.0",
    info: { title: "Test API", version: "1.0.0" },
    paths: {
      "/auth/login": {
        post: {
          summary: "User login",
          operationId: "loginUser",
          tags: ["authentication"],
          description: "Authenticate user with credentials and return token",
          responses: ok,
        },
      },
      "/auth/logout": {
        post: {
          summary: "User logout",
          operationId: "logoutUser",
          tags: ["authentication"],
          description: "Invalidate user session token",
          responses: ok,
        },
      },
      "/users/profile": {
        get: {
          summary: "Get profile",
          operationId: "getProfile",
          tags: ["users"],
          description: "Get current user profile",
          responses: ok,
        },
      },
    },
  });
  // All searches run with empty documentation against the same spec.
  const search = (query: string) =>
    documentationSearch.extractRelevantSections("", query, 5, 2000, openApiSpec);

  // Path components
  const byPath = search("auth login");
  expect(byPath).toContain("[POST /auth/login]");
  expect(byPath).toContain("loginUser");
  expect(byPath).toContain("Authenticate user with credentials");

  // operationId
  const byOperationId = search("logoutUser");
  expect(byOperationId).toContain("[POST /auth/logout]");
  expect(byOperationId).toContain("logoutUser");

  // Tag
  const byTag = search("authentication");
  expect(byTag).toContain("authentication");
  expect(byTag).toContain("login");
});
it("should limit number of returned operations based on maxSections", () => {
  const ok = { "200": { description: "Success" } };
  // Five user operations, more than the maxSections used below.
  const spec = {
    openapi: "3.0.0",
    info: { title: "Test API", version: "1.0.0" },
    paths: {
      "/users": {
        get: {
          summary: "List users",
          operationId: "listUsers",
          description: "Get all users in the system",
          responses: ok,
        },
        post: {
          summary: "Create user",
          operationId: "createUser",
          description: "Create a new user account",
          responses: { "201": { description: "Created" } },
        },
      },
      "/users/{id}": {
        get: {
          summary: "Get user",
          operationId: "getUser",
          description: "Retrieve user by ID",
          responses: ok,
        },
        put: {
          summary: "Update user",
          operationId: "updateUser",
          description: "Update user information",
          responses: ok,
        },
        delete: {
          summary: "Delete user",
          operationId: "deleteUser",
          description: "Remove user from system",
          responses: { "204": { description: "Deleted" } },
        },
      },
    },
  };
  const maxSections = 2;

  const extracted = documentationSearch.extractRelevantSections(
    "",
    "users",
    maxSections,
    2000,
    JSON.stringify(spec),
  );

  expect(extracted).toContain("=== OPENAPI OPERATIONS ===");
  // Each operation is introduced by "[METHOD "; count those markers.
  const operationMarkers = extracted.match(/\[(?:GET|POST|PUT|DELETE) /g) || [];
  expect(operationMarkers.length).toBeLessThanOrEqual(maxSections);
});
it("should handle OpenAPI spec with parameters in operations", () => {
  const spec = {
    openapi: "3.0.0",
    info: { title: "Test API", version: "1.0.0" },
    paths: {
      "/search": {
        get: {
          summary: "Search items",
          operationId: "searchItems",
          description: "Search for items using query parameters",
          parameters: [
            { name: "query", in: "query", required: true, schema: { type: "string" } },
            { name: "limit", in: "query", schema: { type: "integer" } },
            { name: "offset", in: "query", schema: { type: "integer" } },
          ],
          responses: { "200": { description: "Success" } },
        },
      },
    },
  };

  // The query includes parameter names ("query", "limit").
  const extracted = documentationSearch.extractRelevantSections(
    "",
    "query limit search",
    5,
    2000,
    JSON.stringify(spec),
  );

  for (const fragment of ["[GET /search]", "searchItems", "query", "limit"]) {
    expect(extracted).toContain(fragment);
  }
});
it("should combine documentation sections with OpenAPI operations", () => {
  const spec = {
    openapi: "3.0.0",
    info: { title: "Test API", version: "1.0.0" },
    paths: {
      "/users": {
        get: {
          summary: "Get users",
          operationId: "getUsers",
          description: "List all users",
          responses: { "200": { description: "Success" } },
        },
      },
    },
  };
  const doc = [
    "This documentation explains how to use the users endpoint. ",
    "The users API allows you to manage user accounts. ",
    "You can list, create, update, and delete users.",
  ].join("");

  const extracted = documentationSearch.extractRelevantSections(
    doc,
    "users",
    5,
    2000,
    JSON.stringify(spec),
  );

  // Prose from the documentation ...
  // (the === DOCUMENTATION === header is only added when security info is present)
  expect(extracted).toContain("users endpoint");
  expect(extracted).toContain("manage user accounts");
  // ... together with the matching OpenAPI operation.
  expect(extracted).toContain("=== OPENAPI OPERATIONS ===");
  expect(extracted).toContain("[GET /users]");
  expect(extracted).toContain("getUsers");
});
it("should add DOCUMENTATION header when security info is also present", () => {
  const spec = {
    openapi: "3.0.0",
    info: { title: "Test API", version: "1.0.0" },
    components: {
      securitySchemes: {
        bearerAuth: { type: "http", scheme: "bearer" },
      },
    },
    security: [{ bearerAuth: [] }],
    paths: {
      "/users": {
        get: {
          summary: "Get users",
          operationId: "getUsers",
          description: "List all users",
          responses: { "200": { description: "Success" } },
        },
      },
    },
  };
  const doc = "This documentation explains the API. Users endpoint allows managing accounts.";

  // The query contains a security keyword ("authentication").
  const extracted = documentationSearch.extractRelevantSections(
    doc,
    "users authentication",
    5,
    2000,
    JSON.stringify(spec),
  );

  // All three headed sections are expected, each with its content.
  const expectedFragments = [
    "=== SECURITY ===",
    "bearerAuth",
    "=== DOCUMENTATION ===",
    "Users endpoint",
    "=== OPENAPI OPERATIONS ===",
    "[GET /users]",
  ];
  for (const fragment of expectedFragments) {
    expect(extracted).toContain(fragment);
  }
});
it("should handle Google Discovery schema format", () => {
  // Discovery document: operations live under resources.<name>.methods.
  const discoveryDoc = {
    kind: "discovery#restDescription",
    name: "testapi",
    version: "v1",
    resources: {
      users: {
        methods: {
          list: {
            id: "testapi.users.list",
            path: "users",
            httpMethod: "GET",
            description: "Lists all users in the system",
            parameters: {
              maxResults: { type: "integer", location: "query" },
            },
          },
          insert: {
            id: "testapi.users.insert",
            path: "users",
            httpMethod: "POST",
            description: "Creates a new user",
          },
        },
      },
    },
  };

  const extracted = documentationSearch.extractRelevantSections(
    "",
    "users list",
    5,
    2000,
    JSON.stringify(discoveryDoc),
  );

  expect(extracted).toContain("=== OPENAPI OPERATIONS ===");
  expect(extracted).toContain("testapi.users.list");
  expect(extracted).toContain("Lists all users");
  expect(extracted).toContain("GET");
});
});
});
describe("Sitemap and URL Ranking", () => {
// Strategy under test, rebuilt before every test in this suite.
let strategy: PlaywrightFetchingStrategy;
beforeEach(() => {
  strategy = new PlaywrightFetchingStrategy();
  // Clear all mock state and rebuild the axios handle and Playwright doubles.
  vi.clearAllMocks();
  mockedAxios = axios as Mocked<typeof axios>;
  mockedAxios.get.mockReset();
  mockedAxios.post.mockReset();
  const freshMocks = createPlaywrightMocks();
  mockPage = freshMocks.mockPage;
  mockContext = freshMocks.mockContext;
  mockBrowser = freshMocks.mockBrowser;
});
describe("rankItems", () => {
it("should filter out URLs with excluded keywords", () => {
  const authUrl = "https://api.com/docs/authentication";
  const gettingStartedUrl = "https://api.com/docs/getting-started";
  const pricingUrl = "https://api.com/pricing";
  const signupUrl = "https://api.com/signup";
  const blogUrl = "https://api.com/blog/updates";
  const candidates = [gettingStartedUrl, pricingUrl, authUrl, signupUrl, blogUrl];

  const ranked = strategy.rankItems(candidates, ["docs", "authentication"]);

  // Only the two docs URLs survive, best match first.
  expect(ranked).toHaveLength(2);
  expect(ranked[0]).toBe(authUrl);
  expect(ranked[1]).toBe(gettingStartedUrl);
  // pricing, signup and blog are excluded completely.
  for (const excluded of [pricingUrl, signupUrl, blogUrl]) {
    expect(ranked).not.toContain(excluded);
  }
});
it("should rank URLs by keyword match count divided by URL length", () => {
  // One match, no "api" in the domain.
  const singleMatchUrl = "https://example.com/v1/users/read/fast";
  // Two matches, long URL.
  const longTwoMatchUrl = "https://api.com/documentation/api/v1/users/endpoints";
  // Two matches, short URL.
  const shortTwoMatchUrl = "https://api.com/api/users";

  const ranked = strategy.rankItems(
    [singleMatchUrl, longTwoMatchUrl, shortTwoMatchUrl],
    ["api", "users"],
  ) as string[];

  // Expected order: short two-match, long two-match, single match.
  expect(ranked[0]).toBe(shortTwoMatchUrl);
  expect(ranked[1]).toBe(longTwoMatchUrl);
  expect(ranked[2]).toBe(singleMatchUrl);
});
it("should handle link objects with text", () => {
  const candidates = [
    { linkText: "API Reference", href: "https://api.com/reference" },
    { linkText: "Getting Started", href: "https://api.com/start" },
    { linkText: "Pricing Plans", href: "https://api.com/pricing" },
  ];

  const ranked = strategy.rankItems(candidates, ["api", "reference"]);

  // The pricing link is excluded completely; the other two keep their shape.
  expect(ranked).toHaveLength(2);
  const expectedFirst = { linkText: "API Reference", href: "https://api.com/reference" };
  const expectedSecond = { linkText: "Getting Started", href: "https://api.com/start" };
  expect(ranked[0]).toEqual(expectedFirst);
  expect(ranked[1]).toEqual(expectedSecond);
  expect(ranked).not.toContainEqual({
    linkText: "Pricing Plans",
    href: "https://api.com/pricing",
  });
});
it("should filter already fetched links when provided", () => {
  const alreadyFetchedUrl = "https://api.com/docs/intro";
  const candidates = [alreadyFetchedUrl, "https://api.com/docs/api", "https://api.com/docs/guide"];
  const fetchedLinks = new Set([alreadyFetchedUrl]);

  const ranked = strategy.rankItems(candidates, ["docs"], fetchedLinks) as string[];

  // The URL in the fetched set is dropped, leaving the other two.
  expect(ranked).toHaveLength(2);
  expect(ranked).not.toContain(alreadyFetchedUrl);
});
});
describe("Sitemap fetching", () => {
it("should fetch and parse XML sitemap", async () => {
  const sitemapXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://api.com/docs/intro</loc></url>
<url><loc>https://api.com/docs/auth</loc></url>
<url><loc>https://api.com/pricing</loc></url>
<url><loc>https://api.com/docs/api</loc></url>
</urlset>`;
  // Any URL containing "sitemap.xml" serves the sitemap; every other GET rejects.
  mockedAxios.get.mockImplementation((url: string) =>
    url.includes("sitemap.xml")
      ? Promise.resolve({ data: sitemapXml })
      : Promise.reject(new Error("404")),
  );
  // Every page visited through Playwright has the same content.
  mockPage.evaluate.mockResolvedValue({
    html: "<html><body>Content</body></html>",
    textContent: "Content",
    links: {},
  });

  const fetcher = new DocumentationFetcher(
    { documentationUrl: "https://api.com/docs", keywords: ["api", "auth"] },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  // A sitemap was requested ...
  expect(mockedAxios.get).toHaveBeenCalledWith(
    expect.stringContaining("sitemap.xml"),
    expect.any(Object),
  );
  // ... and pages were fetched and their content returned.
  expect(mockPage.goto).toHaveBeenCalled();
  expect(output).toContain("Content");
});
it("should handle sitemap index with nested sitemaps", async () => {
  // Index pointing at a docs sitemap and a blog sitemap.
  const indexXml = `<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap><loc>https://api.com/docs/sitemap.xml</loc></sitemap>
<sitemap><loc>https://api.com/blog/sitemap.xml</loc></sitemap>
</sitemapindex>`;
  const docsSitemapXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://api.com/docs/intro</loc></url>
<url><loc>https://api.com/docs/api</loc></url>
</urlset>`;
  // The root sitemap (or sitemap_index.xml) serves the index, the docs
  // sitemap serves its URL set, and everything else rejects.
  mockedAxios.get.mockImplementation((url: string) => {
    const isIndexRequest =
      url.includes("sitemap_index.xml") || url === "https://api.com/sitemap.xml";
    if (isIndexRequest) {
      return Promise.resolve({ data: indexXml });
    }
    return url.includes("docs/sitemap.xml")
      ? Promise.resolve({ data: docsSitemapXml })
      : Promise.reject(new Error("404"));
  });
  mockPage.evaluate.mockResolvedValue({
    html: "<html><body>Docs</body></html>",
    textContent: "Docs",
    links: {},
  });

  const fetcher = new DocumentationFetcher(
    { documentationUrl: "https://api.com/docs", keywords: ["docs"] },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  // At least one sitemap URL was requested and page content came back.
  expect(mockedAxios.get).toHaveBeenCalledWith(
    expect.stringContaining("sitemap"),
    expect.any(Object),
  );
  expect(output).toContain("Docs");
});
it("should fall back to legacy crawling if no sitemap found", async () => {
  // Every axios GET rejects, so no sitemap can be retrieved.
  mockedAxios.get.mockRejectedValue(new Error("404"));
  // The first evaluated page exposes two outgoing links.
  const landingPage = {
    html: "<html><body>Main Page</body></html>",
    textContent: "Main Page",
    links: {
      "api reference https docs api": "https://api.com/docs/api",
      "getting started https docs start": "https://api.com/docs/start",
    },
  };
  mockPage.evaluate.mockResolvedValueOnce(landingPage);

  const fetcher = new DocumentationFetcher(
    { documentationUrl: "https://api.com/docs", keywords: ["api"] },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  // Pages were still visited and the landing page content is in the result.
  expect(mockPage.goto).toHaveBeenCalled();
  expect(output).toContain("Main Page");
});
it("should respect MAX_FETCHED_LINKS limit", async () => {
  // Build a sitemap advertising 100 pages (page0 .. page99).
  const advertisedPages = 100;
  let urlEntries = "";
  for (let i = 0; i < advertisedPages; i++) {
    urlEntries += `<url><loc>https://api.com/docs/page${i}</loc></url>`;
  }
  const sitemapXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
${urlEntries}
</urlset>`;

  // Any URL containing "sitemap.xml" serves the large sitemap; everything else 404s.
  mockedAxios.get.mockImplementation((url: string) => {
    if (!url.includes("sitemap.xml")) {
      return Promise.reject(new Error("404"));
    }
    return Promise.resolve({ data: sitemapXml });
  });

  // Every page evaluation yields the same link-free payload.
  const pagePayload = {
    html: "<html><body>Page</body></html>",
    textContent: "Page",
    links: {},
  };
  mockPage.evaluate.mockResolvedValue(pagePayload);

  const fetcher = new DocumentationFetcher(
    {
      documentationUrl: "https://api.com/docs",
      keywords: ["docs"],
    },
    {},
    metadata,
  );
  await fetcher.fetchAndProcess();

  // The number of page navigations must equal the configured link limit exactly.
  const configuredLimit = server_defaults.DOCUMENTATION.MAX_FETCHED_LINKS;
  expect(mockPage.goto).toHaveBeenCalledTimes(configuredLimit);
});
it("should filter sitemap URLs by path relevance", async () => {
  // Sitemap mixing two URLs under /docs/api with two unrelated ones
  // (/company/about and /marketing/landing).
  const sitemapXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://api.com/docs/api/intro</loc></url>
<url><loc>https://api.com/company/about</loc></url>
<url><loc>https://api.com/docs/api/auth</loc></url>
<url><loc>https://api.com/marketing/landing</loc></url>
</urlset>`;

  // Serve the sitemap for any URL containing "sitemap.xml"; reject everything else.
  mockedAxios.get.mockImplementation((url: string) => {
    const isSitemapRequest = url.includes("sitemap.xml");
    if (isSitemapRequest) {
      return Promise.resolve({ data: sitemapXml });
    }
    return Promise.reject(new Error("404"));
  });

  // Every page evaluation yields the same link-free payload.
  const pagePayload = {
    html: "<html><body>Content</body></html>",
    textContent: "Content",
    links: {},
  };
  mockPage.evaluate.mockResolvedValue(pagePayload);

  const fetcher = new DocumentationFetcher(
    {
      documentationUrl: "https://api.com/docs/api",
      // Keywords chosen to match the final path segment of the /docs/api URLs.
      keywords: ["intro", "auth"],
    },
    {},
    metadata,
  );
  await fetcher.fetchAndProcess();

  // Collect the first argument of every page.goto call.
  const visitedUrls = mockPage.goto.mock.calls.map((args) => args[0]);
  const looksRelevant = (url) => url.includes("intro") || url.includes("auth");

  // At least one page was visited, and at least one visited URL is a relevant one.
  // NOTE(review): these are positive checks only; nothing here asserts that the
  // /company or /marketing URLs were excluded.
  expect(visitedUrls.length).toBeGreaterThan(0);
  expect(visitedUrls.some(looksRelevant)).toBe(true);
});
it("should deduplicate similar page content based on similarity threshold", async () => {
  // Sitemap advertising four pages.
  const sitemapXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>https://api.com/docs/page1</loc></url>
<url><loc>https://api.com/docs/page2</loc></url>
<url><loc>https://api.com/docs/page3</loc></url>
<url><loc>https://api.com/docs/page4</loc></url>
</urlset>`;

  // Serve the sitemap for any URL containing "sitemap.xml"; reject everything else.
  mockedAxios.get.mockImplementation((url: string) => {
    if (!url.includes("sitemap.xml")) {
      return Promise.reject(new Error("404"));
    }
    return Promise.resolve({ data: sitemapXml });
  });

  // Two distinct page texts. The authentication text is served more than once
  // so that the result would contain duplicates if nothing removed them.
  const authText =
    "Authentication API documentation with bearer token support and OAuth flows for secure access " +
    "x".repeat(500);
  const authTextAgain = authText;
  const webhookText =
    "Completely different content about webhooks and event subscriptions for real-time updates " +
    "y".repeat(500);

  // Wraps a page text in the payload shape returned by the evaluate mock.
  const asPagePayload = (text: string) => ({
    html: `<html><body>${text}</body></html>`,
    textContent: text,
    links: {},
  });

  // Scripted texts for evaluate calls 1..3; every later call repeats the auth text.
  const scriptedTexts = [authText, authTextAgain, webhookText];
  let evaluateCalls = 0;
  mockPage.evaluate.mockImplementation(() => {
    evaluateCalls += 1;
    const text =
      evaluateCalls <= scriptedTexts.length ? scriptedTexts[evaluateCalls - 1] : authTextAgain;
    return Promise.resolve(asPagePayload(text));
  });

  const fetcher = new DocumentationFetcher(
    {
      documentationUrl: "https://api.com/docs",
      keywords: ["api"],
    },
    {},
    metadata,
  );
  const output = await fetcher.fetchAndProcess();

  // More than one page must have been evaluated for deduplication to be exercised.
  expect(mockPage.goto).toHaveBeenCalled();
  expect(evaluateCalls).toBeGreaterThan(1);

  // Both distinct texts are present in the result.
  expect(output).toContain("Authentication API documentation");
  expect(output).toContain("webhooks and event subscriptions");

  // The repeated authentication text must appear exactly once.
  const authMatches = Array.from(
    output.matchAll(/Authentication API documentation with bearer token support/g),
  );
  expect(authMatches.length).toBe(1);
});
});
});
});