#!/usr/bin/env python3
"""
YouTube Search Tool
This script searches YouTube and returns structured video results.
Features:
- Search YouTube with any query
- Returns video metadata for each result
- Supports multiple languages
- Configurable max results
- Handles pagination automatically
Usage:
python youtube_search.py "QUERY" [--lang LANG] [--max-results N]
Output:
JSON array of search results with:
- video_id, video_title, duration
- view_count, published_date
- channel_id, channel_name, channel_link
Example:
$ python youtube_search.py "machine learning tutorial" --max-results 10
[{"video_id": "abc", "video_title": "ML Basics", ...}, ...]
$ python youtube_search.py "Andrej Karpathy" --lang en --max-results 5
[{"video_id": "xyz", "channel_name": "Andrej Karpathy", ...}, ...]
"""
import argparse
import json
import logging
import sys
import time

import httpx
import ua_generator
from pydantic import BaseModel

LOGGER = logging.getLogger(__name__)
# Constants
REQUEST_TIMEOUT = 30  # Request timeout in seconds for search API
MAX_RETRIES = 2  # Maximum number of retry attempts for search operations
RETRY_DELAY_FIXED = 3  # Fixed delay in seconds between retry attempts
MAX_ITERATIONS = 30  # Maximum iterations for pagination

class YTSearchResult(BaseModel):
    """YouTube search result model.

    Equality and hashing are based on ``channel_id`` only, so collections
    such as sets deduplicate results by channel rather than by video.
    """

    video_id: str
    video_title: str
    published_date: str
    duration: str
    view_count: str
    channel_name: str
    channel_id: str
    channel_link: str

    def __eq__(self, other):
        if not isinstance(other, YTSearchResult):
            return False
        return self.channel_id == other.channel_id

    def __hash__(self):
        return hash(self.channel_id)
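

# A minimal sketch of the channel-level dedup this enables (hypothetical
# field values, not real YouTube data):
#
#   a = YTSearchResult(video_id="v1", video_title="A", published_date="1 day ago",
#                      duration="10:00", view_count="1K views", channel_name="Chan",
#                      channel_id="UC123", channel_link="/@chan")
#   b = a.model_copy(update={"video_id": "v2", "video_title": "B"})
#   len({a, b})  # -> 1, because both share channel_id "UC123"

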
def extract_serp_continuation_token(response: dict) -> str:
    """Extract continuation token for search result pagination.

    Handles both initial search response and continuation response formats.

    Args:
        response: YouTube search API JSON response

    Returns:
        Continuation token string for next page

    Raises:
        ValueError: Token not found in expected location
        AttributeError: Unrecognized response structure
    """
try:
if "contents" in response:
# Initial search response format
# fmt: off
token = response["contents"]["twoColumnSearchResultsRenderer"]\
["primaryContents"]["sectionListRenderer"]["contents"][-1]\
["continuationItemRenderer"]["continuationEndpoint"]\
["continuationCommand"]["token"]
# fmt: on
LOGGER.debug("Extracted Continuation Token")
return token
if "onResponseReceivedCommands" in response:
# Continuation response format
# fmt: off
token = response["onResponseReceivedCommands"][0]["appendContinuationItemsAction"]\
["continuationItems"][-1]["continuationItemRenderer"]\
["continuationEndpoint"]["continuationCommand"]["token"]
# fmt: on
LOGGER.debug("Extracted Continuation Token")
return token
LOGGER.warning(
f"Unknown serp response when extracting continuation token: {', '.join(response.keys())}"
)
raise AttributeError("Unknown response from YT API")
    except KeyError as error:
        raise ValueError("No token extracted") from error
    except AttributeError:
        # Unknown format was already logged as a warning above; re-raise
        # without the misleading "API changed" critical log.
        raise
    except Exception as error:
        LOGGER.critical(
            f"Unable to extract continuation token. Check if the API has changed: {error}. PAGE: {json.dumps(response)}"
        )
        raise
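

# A minimal sketch of the continuation-format path (hypothetical token value;
# real responses carry far more keys):
#
#   resp = {"onResponseReceivedCommands": [{"appendContinuationItemsAction": {
#       "continuationItems": [{"continuationItemRenderer": {"continuationEndpoint": {
#           "continuationCommand": {"token": "EpMDEgZr..."}}}}]}}]}
#   extract_serp_continuation_token(resp)  # -> "EpMDEgZr..."

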
def parse_serp(response: dict) -> list[YTSearchResult]:
    """Parse YouTube search response and extract video results.

    Args:
        response: YouTube search API JSON response

    Returns:
        List of search result objects

    Notes:
        Skips non-video entries and malformed items
    """
if "contents" in response:
# Initial search response format
# fmt: off
videos = response["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]\
["sectionListRenderer"]["contents"][0]\
["itemSectionRenderer"]["contents"]
# fmt: on
elif "onResponseReceivedCommands" in response:
# Continuation response format
# fmt: off
videos = response["onResponseReceivedCommands"][0]["appendContinuationItemsAction"]\
["continuationItems"][0]["itemSectionRenderer"]["contents"]
# fmt: on
else:
LOGGER.error(
f"Unknown response format when extracting channels: {', '.join(response.keys())}"
)
raise ValueError("No channels found")
LOGGER.debug(f"Found {len(videos)} video containers for the current SERP")
channels: list[YTSearchResult] = []
for vid in videos:
try:
# Skip non-video entries
if "videoRenderer" not in vid.keys():
continue
# Extract video title
title_data = vid["videoRenderer"]["title"]
if "runs" in title_data:
title = title_data["runs"][0]["text"]
elif "accessibility" in title_data:
title = title_data["accessibility"]["accessibilityData"]["label"]
else:
continue
# Extract channel info
owner_data = vid["videoRenderer"]["ownerText"]
if "runs" not in owner_data:
continue
# Extract all video and channel metadata
# fmt: off
channel_name = owner_data["runs"][0]["text"]
channel_id = owner_data["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]
channel_url = owner_data["runs"][0]["navigationEndpoint"]["browseEndpoint"]["canonicalBaseUrl"]
video_id = vid["videoRenderer"]["videoId"]
published_time = vid["videoRenderer"]["publishedTimeText"]["simpleText"]
            video_length = vid["videoRenderer"]["lengthText"]["simpleText"]
            view_count = vid["videoRenderer"]["viewCountText"]["simpleText"]
# fmt: on
channel = YTSearchResult(
video_id=video_id,
published_date=published_time,
duration=video_length,
view_count=view_count,
channel_name=channel_name,
channel_id=channel_id,
channel_link=channel_url,
video_title=title,
)
channels.append(channel)
except Exception as error:
if isinstance(error, KeyError) and "browseEndpoint" in str(error):
continue
LOGGER.error(
f"Something's wrong when extracting channel data: {error}. VIDEO: {json.dumps(vid)}"
)
LOGGER.debug(f"Extracted {len(channels)} channels")
return channels
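

# A minimal sketch of the initial-response path through parse_serp
# (hypothetical values throughout):
#
#   vid = {"videoRenderer": {
#       "videoId": "abc123",
#       "title": {"runs": [{"text": "ML Basics"}]},
#       "ownerText": {"runs": [{"text": "Chan", "navigationEndpoint": {
#           "browseEndpoint": {"browseId": "UC123", "canonicalBaseUrl": "/@chan"}}}]},
#       "publishedTimeText": {"simpleText": "1 day ago"},
#       "lengthText": {"simpleText": "10:00"},
#       "viewCountText": {"simpleText": "1K views"}}}
#   resp = {"contents": {"twoColumnSearchResultsRenderer": {"primaryContents": {
#       "sectionListRenderer": {"contents": [{"itemSectionRenderer": {"contents": [vid]}}]}}}}}
#   parse_serp(resp)  # -> [YTSearchResult(video_id="abc123", ...)]

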
def get_serp(
    query: str,
    language_code: str,
    continuation_token: str | None = None,
    session: httpx.Client | None = None,
) -> dict:
    """Execute YouTube search API request.

    Args:
        query: Search query string
        language_code: Two-letter language code (e.g., 'en')
        continuation_token: Optional token for pagination
        session: Optional httpx client for connection reuse

    Returns:
        Search results JSON response

    Raises:
        httpx.HTTPStatusError: Request failed
        httpx.RequestError: Network error
    """
url = "https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"
# Build search request payload
payload = {
"context": {
"client": {
"hl": langauge_code,
"remoteHost": "112.134.244.81",
"deviceMake": "",
"deviceModel": "",
"userAgent": ua_generator.generate().text, # Randomize user agent
"clientName": "WEB",
"clientVersion": "2.20250304.01.00",
"osName": "X11",
"osVersion": "",
"originalUrl": f"https://www.youtube.com/results?search_query={query}",
"platform": "DESKTOP",
"clientFormFactor": "UNKNOWN_FORM_FACTOR",
"userInterfaceTheme": "USER_INTERFACE_THEME_DARK",
"browserName": "Firefox",
"browserVersion": "136.0",
"acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
},
"user": {"lockedSafetyMode": False},
"request": {
"useSsl": True,
"internalExperimentFlags": [],
"consistencyTokenJars": [],
},
},
"query": query,
}
if continuation_token:
payload["continuation"] = continuation_token
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0",
"Accept": "*/*",
"Accept-Language": "en-GB,en;q=0.7,en-US;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Content-Type": "application/json",
"Referer": "https://www.youtube.com/results?search_query=data",
"Origin": "https://www.youtube.com",
"DNT": "1",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "same-origin",
"Sec-Fetch-Site": "same-origin",
"Connection": "keep-alive",
"Pragma": "no-cache",
"Cache-Control": "no-cache",
"TE": "trailers",
}
    if session:
        response = session.post(
            url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()
    # No shared session: create a short-lived client for this single request.
    limits = httpx.Limits(max_keepalive_connections=100, max_connections=200)
    timeout = httpx.Timeout(REQUEST_TIMEOUT, connect=10.0)
    with httpx.Client(limits=limits, timeout=timeout) as local_session:
        response = local_session.post(url, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()
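

# Example of reusing one connection across paginated calls (a sketch; the
# query string is illustrative):
#
#   with httpx.Client() as client:
#       first = get_serp("lofi beats", "en", session=client)
#       token = extract_serp_continuation_token(first)
#       second = get_serp("lofi beats", "en", continuation_token=token, session=client)

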
def handle_get_serp(
    query: str,
    language_code: str,
    continuation_token: str | None = None,
    session: httpx.Client | None = None,
) -> dict:
    """Execute search request with retry logic.

    Args:
        query: Search query string
        language_code: Language code for localization
        continuation_token: Optional pagination token
        session: Optional httpx client

    Returns:
        Search results JSON response

    Raises:
        httpx.ReadTimeout, httpx.ConnectTimeout: Connection issues
        ValueError: Unable to fetch after retries
    """
    retries = 0
    while retries < MAX_RETRIES:
        try:
            return get_serp(
                query=query,
                language_code=language_code,
                continuation_token=continuation_token,
                session=session,
            )
        except (httpx.ReadTimeout, httpx.ConnectTimeout):
            # Surface timeouts to the caller, which decides whether to retry.
            raise
        except httpx.HTTPStatusError as e:
            LOGGER.error(
                f"QUERY: {query}. HTTP ERROR {e.response.status_code}: {e.response.text}"
            )
            break
        except Exception as e:
            LOGGER.exception(f"QUERY: {query}. {e}", stacklevel=5)
            time.sleep(RETRY_DELAY_FIXED)
            retries += 1
    raise ValueError("Unable to fetch SERP")


def handle_get_serps(
    query: str, language_code: str = "en", max_results: int = 30
) -> list[YTSearchResult]:
    """Fetch paginated YouTube search results.

    Args:
        query: Search query string
        language_code: Language code (default 'en')
        max_results: Maximum results to return (default 30)

    Returns:
        List of search result objects, up to max_results

    Notes:
        Handles pagination automatically
        Stops at max_results or when no more pages available
    """
LOGGER.debug(f"Processing '{query}' for {langauge_code}")
serp_items: list[YTSearchResult] = []
continuation_token: str | None = None
iterations = 0
with httpx.Client() as session:
while iterations < MAX_ITERATIONS:
            # Stop once enough results have been collected (trimmed on return).
            if len(serp_items) >= max_results:
break
iterations += 1
try:
serp = handle_get_serp(
query=query,
                    language_code=language_code,
continuation_token=continuation_token,
session=session,
)
page_results = parse_serp(serp)
if not page_results:
break
serp_items.extend(page_results)
continuation_token = extract_serp_continuation_token(serp)
            except ValueError:
                # No continuation token (end of results) or fetch failed
                # after retries; stop paginating.
                break
            except (httpx.ReadTimeout, httpx.ConnectTimeout):
                # Transient timeout; retry this page on the next iteration.
                continue
            except Exception:
                LOGGER.exception(
                    f"QUERY: {query}. Error processing SERP", stacklevel=5
                )
                break
return serp_items[:max_results]
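

# Example library usage (a sketch; query and count are illustrative):
#
#   results = handle_get_serps("machine learning tutorial", max_results=5)
#   for r in results:
#       print(r.video_id, r.video_title, r.channel_name)

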
def main(argv: list[str] | None = None) -> int:
    """CLI entrypoint for YouTube search.

    Args:
        argv: Optional command-line arguments

    Returns:
        Exit code (0 for success, 1 for error)
    """
parser = argparse.ArgumentParser(
description="Search YouTube and return structured results as JSON",
epilog="Examples:\n"
" %(prog)s 'machine learning tutorial' --max-results 10\n"
" %(prog)s 'Andrej Karpathy' --lang en\n"
" %(prog)s 'Veritasium physics' -n 50",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"query", help="Search query string (e.g., 'machine learning', 'Veritasium')"
)
parser.add_argument(
"--lang",
"-l",
dest="lang",
default="en",
help="Language code for results (e.g., en, es, fr, de) (default: en)",
)
parser.add_argument(
"--max-results",
"-n",
dest="max_results",
type=int,
default=30,
help="Maximum number of results to return (default: 30)",
)
args = parser.parse_args(argv)
    try:
        results = handle_get_serps(
            args.query, language_code=args.lang, max_results=args.max_results
        )
        out = json.dumps(
            [r.model_dump() for r in results], ensure_ascii=False, default=str
        )
        print(out, file=sys.stdout)
        return 0
    except Exception as e:
        print(f"Error fetching SERPs for query '{args.query}': {e}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    sys.exit(main())