# openapi.yml (22 kB)
# Version of the OpenAPI specification this document is written against.
openapi: "3.1.0"

# General metadata about the API being described.
info:
  title: WebScraping.AI
  # Version of this API description (distinct from the `openapi` version above).
  version: "3.2.0"
  description: >-
    WebScraping.AI scraping API provides LLM-powered tools with Chromium
    JavaScript rendering, rotating proxies, and built-in HTML parsing.
  contact:
    name: WebScraping.AI Support
    url: "https://webscraping.ai"
    email: "support@webscraping.ai"
# Tag definitions; each operation under `paths` references one of these names
# in its own `tags` list.
tags:
  - name: "AI"
    description: "Analyze web pages using LLMs"
  - name: "HTML"
    description: "Get full HTML content of pages using proxies and Chromium JS rendering"
  - name: "Text"
    description: "Get visible text of pages using proxies and Chromium JS rendering"
  - name: "Selected HTML"
    description: "Get HTML content of selected page areas (like price, search results, page title, etc.)"
  - name: "Account"
    description: "Information about your account API credits quota"
# API operations. All of them are GET requests.
#
# Response status codes are written as quoted strings ("200", "400", ...):
# OpenAPI requires quoted status-code keys so that YAML and JSON renderings of
# this document agree (an unquoted 200 is parsed by YAML as an integer key).
paths:
  /ai/question:
    get:
      operationId: getQuestion
      tags:
        - AI
      summary: Get an answer to a question about a given web page
      description: >-
        Returns the answer in plain text. Proxies and Chromium JavaScript
        rendering are used for page retrieval and processing, then the answer
        is extracted using an LLM model.
      parameters:
        - $ref: "#/components/parameters/url"
        - $ref: "#/components/parameters/question"
        - $ref: "#/components/parameters/headers"
        - $ref: "#/components/parameters/timeout"
        - $ref: "#/components/parameters/js"
        - $ref: "#/components/parameters/js_timeout"
        - $ref: "#/components/parameters/wait_for"
        - $ref: "#/components/parameters/proxy"
        - $ref: "#/components/parameters/country"
        - $ref: "#/components/parameters/custom_proxy"
        - $ref: "#/components/parameters/device"
        - $ref: "#/components/parameters/error_on_404"
        - $ref: "#/components/parameters/error_on_redirect"
        - $ref: "#/components/parameters/js_script"
        - $ref: "#/components/parameters/format"
      responses:
        "200":
          description: Success
          content:
            text/html:
              schema:
                type: string
              example: "Some answer"
        "400":
          $ref: "#/components/responses/400"
        "402":
          $ref: "#/components/responses/402"
        "403":
          $ref: "#/components/responses/403"
        "429":
          $ref: "#/components/responses/429"
        "500":
          $ref: "#/components/responses/500"
        "504":
          $ref: "#/components/responses/504"

  /ai/fields:
    get:
      operationId: getFields
      tags:
        - AI
      summary: Extract structured data fields from a web page
      description: >-
        Returns structured data fields extracted from the webpage using an LLM
        model. Proxies and Chromium JavaScript rendering are used for page
        retrieval and processing.
      parameters:
        - $ref: "#/components/parameters/url"
        # Free-form map of field name -> field description, serialized as
        # fields[title]=...&fields[price]=... (deepObject style).
        - name: fields
          in: query
          required: true
          description: Object describing fields to extract from the page and their descriptions
          style: deepObject
          explode: true
          schema:
            type: object
            additionalProperties:
              type: string
          example:
            title: "Main product title"
            price: "Current product price"
            description: "Full product description"
        - $ref: "#/components/parameters/headers"
        - $ref: "#/components/parameters/timeout"
        - $ref: "#/components/parameters/js"
        - $ref: "#/components/parameters/js_timeout"
        - $ref: "#/components/parameters/wait_for"
        - $ref: "#/components/parameters/proxy"
        - $ref: "#/components/parameters/country"
        - $ref: "#/components/parameters/custom_proxy"
        - $ref: "#/components/parameters/device"
        - $ref: "#/components/parameters/error_on_404"
        - $ref: "#/components/parameters/error_on_redirect"
        - $ref: "#/components/parameters/js_script"
      responses:
        "200":
          description: Success
          content:
            application/json:
              # Keys mirror the requested `fields`; every value is a string.
              schema:
                type: object
                additionalProperties:
                  type: string
              example:
                title: "Example Product"
                price: "$99.99"
                description: "This is a sample product description"
        "400":
          $ref: "#/components/responses/400"
        "402":
          $ref: "#/components/responses/402"
        "403":
          $ref: "#/components/responses/403"
        "429":
          $ref: "#/components/responses/429"
        "500":
          $ref: "#/components/responses/500"
        "504":
          $ref: "#/components/responses/504"

  /html:
    get:
      operationId: getHTML
      tags:
        - HTML
      summary: Page HTML by URL
      description: >-
        Returns the full HTML content of a webpage specified by the URL. The
        response is in plain text. Proxies and Chromium JavaScript rendering
        are used for page retrieval and processing.
      parameters:
        - $ref: "#/components/parameters/url"
        - $ref: "#/components/parameters/headers"
        - $ref: "#/components/parameters/timeout"
        - $ref: "#/components/parameters/js"
        - $ref: "#/components/parameters/js_timeout"
        - $ref: "#/components/parameters/wait_for"
        - $ref: "#/components/parameters/proxy"
        - $ref: "#/components/parameters/country"
        - $ref: "#/components/parameters/custom_proxy"
        - $ref: "#/components/parameters/device"
        - $ref: "#/components/parameters/error_on_404"
        - $ref: "#/components/parameters/error_on_redirect"
        - $ref: "#/components/parameters/js_script"
        - $ref: "#/components/parameters/return_script_result"
        - $ref: "#/components/parameters/format"
      responses:
        "200":
          description: Success
          content:
            text/html:
              schema:
                type: string
              example: "<html><head>\n <title>Example Domain</title>\n</head>\n\n<body>\n<div>\n <h1>Example Domain</h1>\n</body></html>"
        "400":
          $ref: "#/components/responses/400"
        "402":
          $ref: "#/components/responses/402"
        "403":
          $ref: "#/components/responses/403"
        "429":
          $ref: "#/components/responses/429"
        "500":
          $ref: "#/components/responses/500"
        "504":
          $ref: "#/components/responses/504"

  /text:
    get:
      operationId: getText
      tags:
        - Text
      summary: Page text by URL
      description: >-
        Returns the visible text content of a webpage specified by the URL. Can
        be used to feed data to LLM models. The response can be in plain text,
        JSON, or XML format based on the text_format parameter. Proxies and
        Chromium JavaScript rendering are used for page retrieval and
        processing. Returns JSON on error.
      parameters:
        - $ref: "#/components/parameters/text_format"
        - $ref: "#/components/parameters/return_links"
        - $ref: "#/components/parameters/url"
        - $ref: "#/components/parameters/headers"
        - $ref: "#/components/parameters/timeout"
        - $ref: "#/components/parameters/js"
        - $ref: "#/components/parameters/js_timeout"
        - $ref: "#/components/parameters/wait_for"
        - $ref: "#/components/parameters/proxy"
        - $ref: "#/components/parameters/country"
        - $ref: "#/components/parameters/custom_proxy"
        - $ref: "#/components/parameters/device"
        - $ref: "#/components/parameters/error_on_404"
        - $ref: "#/components/parameters/error_on_redirect"
        - $ref: "#/components/parameters/js_script"
      responses:
        "200":
          description: Success
          # One media type per text_format value documented in the description
          # above (plain text, XML, JSON).
          content:
            text/html:
              schema:
                type: string
              example: "Some content"
            text/xml:
              schema:
                type: string
              example: "<title>Some title</title>\n<description>Some description</description>\n<content>Some content</content>"
            application/json:
              # NOTE(review): declared as a string carrying serialized JSON,
              # although the example is a JSON object. Left unchanged because
              # switching to `type: object` changes the return type of
              # generated clients - confirm before tightening.
              schema:
                type: string
              example: '{"title":"Some title","description":"Some description","content":"Some content"}'
        "400":
          $ref: "#/components/responses/400"
        "402":
          $ref: "#/components/responses/402"
        "403":
          $ref: "#/components/responses/403"
        "429":
          $ref: "#/components/responses/429"
        "500":
          $ref: "#/components/responses/500"
        "504":
          $ref: "#/components/responses/504"

  /selected:
    get:
      operationId: getSelected
      tags:
        - Selected HTML
      summary: HTML of a selected page area by URL and CSS selector
      description: >-
        Returns HTML of a selected page area by URL and CSS selector. Useful if
        you don't want to do the HTML parsing on your side.
      parameters:
        - name: selector
          in: query
          description: CSS selector (null by default, returns whole page HTML)
          schema:
            type: string
          example: "h1"
        - $ref: "#/components/parameters/url"
        - $ref: "#/components/parameters/headers"
        - $ref: "#/components/parameters/timeout"
        - $ref: "#/components/parameters/js"
        - $ref: "#/components/parameters/js_timeout"
        - $ref: "#/components/parameters/wait_for"
        - $ref: "#/components/parameters/proxy"
        - $ref: "#/components/parameters/country"
        - $ref: "#/components/parameters/custom_proxy"
        - $ref: "#/components/parameters/device"
        - $ref: "#/components/parameters/error_on_404"
        - $ref: "#/components/parameters/error_on_redirect"
        - $ref: "#/components/parameters/js_script"
        - $ref: "#/components/parameters/format"
      responses:
        "200":
          description: Success
          content:
            text/html:
              schema:
                type: string
              example: '<a href="https://www.iana.org/domains/example">More information...</a>'
        "400":
          $ref: "#/components/responses/400"
        "402":
          $ref: "#/components/responses/402"
        "403":
          $ref: "#/components/responses/403"
        "429":
          $ref: "#/components/responses/429"
        "500":
          $ref: "#/components/responses/500"
        "504":
          $ref: "#/components/responses/504"

  /selected-multiple:
    get:
      operationId: getSelectedMultiple
      tags:
        - Selected HTML
      summary: HTML of multiple page areas by URL and CSS selectors
      description: >-
        Returns HTML of multiple page areas by URL and CSS selectors. Useful if
        you don't want to do the HTML parsing on your side.
      parameters:
        # form + explode serializes the array as repeated query parameters
        # (selectors=h1&selectors=h2).
        - name: selectors
          in: query
          description: Multiple CSS selectors (null by default, returns whole page HTML)
          style: form
          explode: true
          schema:
            type: array
            items:
              type: string
          example:
            - "h1"
        - $ref: "#/components/parameters/url"
        - $ref: "#/components/parameters/headers"
        - $ref: "#/components/parameters/timeout"
        - $ref: "#/components/parameters/js"
        - $ref: "#/components/parameters/js_timeout"
        - $ref: "#/components/parameters/wait_for"
        - $ref: "#/components/parameters/proxy"
        - $ref: "#/components/parameters/country"
        - $ref: "#/components/parameters/custom_proxy"
        - $ref: "#/components/parameters/device"
        - $ref: "#/components/parameters/error_on_404"
        - $ref: "#/components/parameters/error_on_redirect"
        - $ref: "#/components/parameters/js_script"
      responses:
        "200":
          description: Success
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/SelectedAreas"
              # A real array of strings, matching the SelectedAreas schema
              # (previously a single JSON-encoded string, which does not
              # validate against an array schema).
              example:
                - "<a href='/test'>some link</a>"
                - "Hello"
        "400":
          $ref: "#/components/responses/400"
        "402":
          $ref: "#/components/responses/402"
        "403":
          $ref: "#/components/responses/403"
        "429":
          $ref: "#/components/responses/429"
        "500":
          $ref: "#/components/responses/500"
        "504":
          $ref: "#/components/responses/504"

  /account:
    get:
      operationId: account
      tags:
        - Account
      summary: Information about your account calls quota
      description: >-
        Returns information about your account, including the remaining API
        credits quota, the next billing cycle start time, and the remaining
        concurrent requests. The response is in JSON format.
      responses:
        "200":
          description: Success
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Account"
              example:
                remaining_api_calls: 200000
                resets_at: 1617073667
                remaining_concurrency: 100
        "403":
          $ref: "#/components/responses/403"
# Document-wide security requirement: the `api_key` scheme declared under
# components.securitySchemes. The empty list means no scopes are requested.
security:
  - api_key: []
# Base URL of the API server.
servers:
  - url: "https://api.webscraping.ai"
# Reusable definitions referenced from `paths` via $ref.
components:
  securitySchemes:
    # The API key is sent as the `api_key` query-string parameter.
    api_key:
      type: apiKey
      name: api_key
      in: query

  # Shared error responses. Keys are quoted strings so that
  # '#/components/responses/400' style references resolve to string keys in
  # both YAML and JSON renderings of this document.
  responses:
    "400":
      description: Parameters validation error
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/Error"
          example:
            message: "Invalid CSS selector"
    "402":
      description: "Billing issue, probably you've run out of credits"
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/Error"
          example:
            message: "Some error"
    "403":
      description: Wrong API key
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/Error"
          example:
            message: "Some error"
    "429":
      description: Too many concurrent requests
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/Error"
          example:
            message: "Some error"
    "500":
      description: >-
        Non-2xx and non-404 HTTP status code on the target page or unexpected
        error, try again or contact support@webscraping.ai
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/Error"
          example:
            message: "Unexpected HTTP code on the target page"
            status_code: 500
            status_message: "Some website error"
    "504":
      description: Timeout error, try increasing timeout parameter value
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/Error"
          example:
            message: "Some error"

  parameters:
    # --- Page-retrieval parameters ---

    url:
      name: url
      in: query
      required: true
      description: URL of the target page.
      schema:
        type: string
      example: "https://example.com"

    # Not referenced by any operation under `paths` in this document; kept
    # because external references cannot be ruled out.
    postUrl:
      name: url
      in: query
      required: true
      description: URL of the target page.
      schema:
        type: string
      example: "https://httpbin.org/post"

    headers:
      name: headers
      in: query
      description: >-
        HTTP headers to pass to the target page. Can be specified either via a
        nested query parameter (...&headers[One]=value1&headers[Another]=value2)
        or as a JSON encoded object
        (...&headers={"One": "value1", "Another": "value2"}).
      style: deepObject
      explode: true
      schema:
        type: object
        additionalProperties:
          type: string
      # Written as an object so the example validates against the schema
      # (same convention as the inline `fields` parameter of /ai/fields).
      example:
        Cookie: "session=some_id"

    timeout:
      name: timeout
      in: query
      description: Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
      schema:
        type: integer
        default: 10000
        minimum: 1
        maximum: 30000
      example: 10000

    js:
      name: js
      in: query
      description: Execute on-page JavaScript using a headless browser (true by default).
      schema:
        type: boolean
        default: true
      example: true

    js_timeout:
      name: js_timeout
      in: query
      description: Maximum JavaScript rendering time in ms. Increase it in case you see a loading indicator instead of data on the target page.
      schema:
        type: integer
        default: 2000
        minimum: 1
        maximum: 20000
      example: 2000

    wait_for:
      name: wait_for
      in: query
      description: CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
      schema:
        type: string

    proxy:
      name: proxy
      in: query
      description: >-
        Type of proxy, use residential proxies if your site restricts traffic
        from datacenters (datacenter by default). Note that residential proxy
        requests are more expensive than datacenter, see the pricing page for
        details.
      schema:
        type: string
        default: "datacenter"
        enum:
          - "datacenter"
          - "residential"
      example: "datacenter"

    country:
      name: country
      in: query
      description: Country of the proxy to use (US by default).
      schema:
        type: string
        default: "us"
        # Values are quoted so none of them can be re-typed by a YAML parser.
        enum: ["us", "gb", "de", "it", "fr", "ca", "es", "ru", "jp", "kr", "in"]
      example: "us"

    custom_proxy:
      name: custom_proxy
      in: query
      description: >-
        Your own proxy URL to use instead of our built-in proxy pool in
        "http://user:password@host:port" format (<a target="_blank"
        href="https://webscraping.ai/proxies/smartproxy">Smartproxy</a> for
        example).
      # No `example` key: the previous bare `example:` parsed as null, which
      # is not a valid example for a string schema.
      schema:
        type: string

    device:
      name: device
      in: query
      description: Type of device emulation.
      schema:
        type: string
        default: "desktop"
        enum:
          - "desktop"
          - "mobile"
          - "tablet"
      example: "desktop"

    error_on_404:
      name: error_on_404
      in: query
      description: Return error on 404 HTTP status on the target page (false by default).
      schema:
        type: boolean
        default: false
      example: false

    error_on_redirect:
      name: error_on_redirect
      in: query
      description: Return error on redirect on the target page (false by default).
      schema:
        type: boolean
        default: false
      example: false

    js_script:
      name: js_script
      in: query
      description: Custom JavaScript code to execute on the target page.
      schema:
        type: string
      example: "document.querySelector('button').click();"

    # --- Operation-specific parameters ---

    return_script_result:
      name: return_script_result
      in: query
      description: Return result of the custom JavaScript code (js_script parameter) execution on the target page (false by default, page HTML will be returned).
      schema:
        type: boolean
        default: false
      example: false

    text_format:
      name: text_format
      in: query
      description: >-
        Format of the text response (plain by default). "plain" will return
        only the page body text. "json" and "xml" will return a json/xml with
        "title", "description" and "content" keys.
      schema:
        type: string
        default: "plain"
        enum:
          - "plain"
          - "xml"
          - "json"
      example: "plain"

    return_links:
      name: return_links
      in: query
      description: "[Works only with text_format=json] Return links from the page body text (false by default). Useful for building web crawlers."
      schema:
        type: boolean
        default: false
      example: false

    question:
      name: question
      in: query
      description: Question or instructions to ask the LLM model about the target page.
      schema:
        type: string
      example: "What is the summary of this page content?"

    format:
      name: format
      in: query
      # NOTE(review): the description says "text by default" while the schema
      # declares `default: "json"`. This document does not show which one the
      # server implements - confirm and align the two.
      description: >-
        Format of the response (text by default). "json" will return a JSON
        object with the response, "text" will return a plain text/HTML
        response.
      schema:
        type: string
        default: "json"
        enum:
          - "json"
          - "text"
      example: "json"

  requestBodies:
    # Not referenced by any operation under `paths` in this document (all
    # visible operations are GET); kept in case of external references.
    Body:
      description: Request body to pass to the target page
      content:
        application/json:
          schema:
            type: object
            additionalProperties: true
        application/x-www-form-urlencoded:
          schema:
            type: object
            additionalProperties: true
        application/xml:
          schema:
            type: object
            additionalProperties: true
        text/plain:
          schema:
            type: string

  schemas:
    # Body shape of the shared error responses declared under
    # components.responses.
    Error:
      title: Generic error
      type: object
      properties:
        message:
          type: string
          description: Error description
        status_code:
          type: integer
          description: Target page response HTTP status code (403, 500, etc)
        status_message:
          type: string
          description: Target page response HTTP status message
        body:
          type: string
          description: Target page response body

    # Response body of /selected-multiple.
    SelectedAreas:
      title: HTML for selected page areas
      type: array
      description: Array of elements matched by selectors
      items:
        type: string

    # Response body of /account.
    Account:
      title: Account limits info
      type: object
      properties:
        email:
          type: string
          description: Your account email
        remaining_api_calls:
          type: integer
          description: Remaining API credits quota
        resets_at:
          type: integer
          description: Next billing cycle start time (UNIX timestamp)
        remaining_concurrency:
          type: integer
          description: Remaining concurrent requests