ingestWebsite
Crawl and ingest website content recursively with customizable depth, link limits, and path filters. Organize extracted data into chunks for efficient processing and integration into knowledge bases.
Instructions
Crawls and ingests content from a website recursively. Supports depth control and path filtering.
Input Schema
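| Name | Required | Description | Default |
|---|---|---|---|
| ingestConfig | Yes | Object describing the ingest. Requires `source` (must be `"WEBSITE"`) and `config` (which requires `url`); `chunkConfig` is optional. | |
| namespaceId | No | String. | |
| tenantId | No | String. | |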
Input Schema (JSON Schema)
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "additionalProperties": false,
  "properties": {
    "ingestConfig": {
      "additionalProperties": false,
      "properties": {
        "chunkConfig": {
          "additionalProperties": false,
          "description": "Optional Chunk config. When not passed, default chunk config will be used.",
          "properties": {
            "chunkOverlap": { "type": "number" },
            "chunkSize": { "type": "number" }
          },
          "required": ["chunkSize", "chunkOverlap"],
          "type": "object"
        },
        "config": {
          "additionalProperties": false,
          "properties": {
            "excludePaths": {
              "items": { "type": "string" },
              "type": "array"
            },
            "includePaths": {
              "items": { "type": "string" },
              "type": "array"
            },
            "maxDepth": { "type": "number" },
            "maxLinks": { "type": "number" },
            "metadata": {
              "additionalProperties": {
                "anyOf": [
                  { "type": "string" },
                  {
                    "items": { "type": "string" },
                    "type": "array"
                  }
                ]
              },
              "type": "object"
            },
            "url": { "type": "string" }
          },
          "required": ["url"],
          "type": "object"
        },
        "source": {
          "const": "WEBSITE",
          "type": "string"
        }
      },
      "required": ["source", "config"],
      "type": "object"
    },
    "namespaceId": { "type": "string" },
    "tenantId": { "type": "string" }
  },
  "required": ["ingestConfig"],
  "type": "object"
}