search.py (14.8 kB)
#!/usr/bin/env python3
"""
YouTube Search Tool

This script searches YouTube and returns structured video results.

Features:
- Search YouTube with any query
- Returns video metadata for each result
- Supports multiple languages
- Configurable max results
- Handles pagination automatically

Usage:
    python youtube_search.py "QUERY" [--lang LANG] [--max-results N]

Output:
    JSON array of search results with:
    - video_id, video_title, duration
    - view_count, published_date
    - channel_id, channel_name, channel_link

Example:
    $ python youtube_search.py "machine learning tutorial" --max-results 10
    [{"video_id": "abc", "video_title": "ML Basics", ...}, ...]

    $ python youtube_search.py "Andrej Karpathy" --lang en --max-results 5
    [{"video_id": "xyz", "channel_name": "Andrej Karpathy", ...}, ...]
"""

import argparse
import json
import logging
import sys
import time

import httpx
import ua_generator
from pydantic import BaseModel

LOGGER = logging.getLogger(__name__)

# Constants
REQUEST_TIMEOUT = 30  # Request timeout in seconds for search API
INITIAL_RETRIES = 2  # Initial retry count for search operations
RETRY_DELAY_FIXED = 3  # Fixed delay for retry operations
MAX_ITERATIONS = 30  # Maximum iterations for pagination


class YTSearchResult(BaseModel):
    """YouTube search result model."""

    video_id: str
    video_title: str
    published_date: str
    duration: str
    view_count: str
    channel_name: str
    channel_id: str
    channel_link: str

    def __eq__(self, value):
        if not isinstance(value, YTSearchResult):
            return False
        return self.channel_id == value.channel_id

    def __hash__(self):
        return hash(self.channel_id)


def extract_serp_continuation_token(response: dict) -> str:
    """Extract continuation token for search result pagination.

    Handles both initial search response and continuation response formats.

    Args:
        response: YouTube search API JSON response

    Returns:
        Continuation token string for next page

    Raises:
        ValueError: Token not found in expected location
        AttributeError: Unrecognized response structure
    """
    try:
        if "contents" in response:
            # Initial search response format
            # fmt: off
            token = response["contents"]["twoColumnSearchResultsRenderer"]\
                ["primaryContents"]["sectionListRenderer"]["contents"][-1]\
                ["continuationItemRenderer"]["continuationEndpoint"]\
                ["continuationCommand"]["token"]
            # fmt: on
            LOGGER.debug("Extracted Continuation Token")
            return token

        if "onResponseReceivedCommands" in response:
            # Continuation response format
            # fmt: off
            token = response["onResponseReceivedCommands"][0]["appendContinuationItemsAction"]\
                ["continuationItems"][-1]["continuationItemRenderer"]\
                ["continuationEndpoint"]["continuationCommand"]["token"]
            # fmt: on
            LOGGER.debug("Extracted Continuation Token")
            return token

        LOGGER.warning(
            f"Unknown serp response when extracting continuation token: {', '.join(response.keys())}"
        )
        raise AttributeError("Unknown response from YT API")
    except KeyError:
        raise ValueError("No token extracted")
    except Exception as error:
        LOGGER.critical(
            f"Unable to extract continuation token. Check if the API has changed: {error}. PAGE: {json.dumps(response)}"
        )
        raise
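

# A minimal sketch (hypothetical token value, sibling keys elided) of the
# initial-response shape that extract_serp_continuation_token walks above --
# a reference point for debugging when the InnerTube layout changes:
#
#   fake_serp = {
#       "contents": {"twoColumnSearchResultsRenderer": {"primaryContents": {
#           "sectionListRenderer": {"contents": [
#               {"itemSectionRenderer": {"contents": []}},
#               {"continuationItemRenderer": {"continuationEndpoint": {
#                   "continuationCommand": {"token": "EoMDEgZrYXJw"}}}},
#           ]}}}},
#   }
#   assert extract_serp_continuation_token(fake_serp) == "EoMDEgZrYXJw"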


def parse_serp(response: dict) -> list[YTSearchResult]:
    """Parse YouTube search response and extract video results.

    Args:
        response: YouTube search API JSON response

    Returns:
        List of search result objects

    Notes:
        Skips non-video entries and malformed items
    """
    if "contents" in response:
        # Initial search response format
        # fmt: off
        videos = response["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"]\
            ["sectionListRenderer"]["contents"][0]\
            ["itemSectionRenderer"]["contents"]
        # fmt: on
    elif "onResponseReceivedCommands" in response:
        # Continuation response format
        # fmt: off
        videos = response["onResponseReceivedCommands"][0]["appendContinuationItemsAction"]\
            ["continuationItems"][0]["itemSectionRenderer"]["contents"]
        # fmt: on
    else:
        LOGGER.error(
            f"Unknown response format when extracting channels: {', '.join(response.keys())}"
        )
        raise ValueError("No channels found")

    LOGGER.debug(f"Found {len(videos)} video containers for the current SERP")

    channels: list[YTSearchResult] = []
    for vid in videos:
        try:
            # Skip non-video entries
            if "videoRenderer" not in vid.keys():
                continue

            # Extract video title
            title_data = vid["videoRenderer"]["title"]
            if "runs" in title_data:
                title = title_data["runs"][0]["text"]
            elif "accessibility" in title_data:
                title = title_data["accessibility"]["accessibilityData"]["label"]
            else:
                continue

            # Extract channel info
            owner_data = vid["videoRenderer"]["ownerText"]
            if "runs" not in owner_data:
                continue

            # Extract all video and channel metadata
            # fmt: off
            channel_name = owner_data["runs"][0]["text"]
            channel_id = owner_data["runs"][0]["navigationEndpoint"]["browseEndpoint"]["browseId"]
            channel_url = owner_data["runs"][0]["navigationEndpoint"]["browseEndpoint"]["canonicalBaseUrl"]
            video_id = vid["videoRenderer"]["videoId"]
            published_time = vid["videoRenderer"]["publishedTimeText"]["simpleText"]
            video_length = vid["videoRenderer"]["lengthText"]["simpleText"]
            view_count = vid["videoRenderer"]["viewCountText"]["simpleText"]
            # fmt: on

            channel = YTSearchResult(
                video_id=video_id,
                published_date=published_time,
                duration=video_length,
                view_count=view_count,
                channel_name=channel_name,
                channel_id=channel_id,
                channel_link=channel_url,
                video_title=title,
            )
            channels.append(channel)
        except Exception as error:
            if isinstance(error, KeyError) and "browseEndpoint" in str(error):
                continue
            LOGGER.error(
                f"Something's wrong when extracting channel data: {error}. VIDEO: {json.dumps(vid)}"
            )

    LOGGER.debug(f"Extracted {len(channels)} channels")
    return channels
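

# For orientation, the subset of one "videoRenderer" item that parse_serp
# actually reads (all values here are made up; real items carry many more
# keys than shown):
#
#   vid = {"videoRenderer": {
#       "videoId": "abc123def45",
#       "title": {"runs": [{"text": "Some Video Title"}]},
#       "publishedTimeText": {"simpleText": "2 years ago"},
#       "lengthText": {"simpleText": "12:34"},
#       "viewCountText": {"simpleText": "1,234,567 views"},
#       "ownerText": {"runs": [{
#           "text": "Some Channel",
#           "navigationEndpoint": {"browseEndpoint": {
#               "browseId": "UCabcdef", "canonicalBaseUrl": "/@somechannel"}},
#       }]},
#   }}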


def get_serp(
    query: str,
    language_code: str,
    continuation_token: str | None = None,
    session: httpx.Client | None = None,
) -> dict:
    """Execute YouTube search API request.

    Args:
        query: Search query string
        language_code: Two-letter language code (e.g., 'en')
        continuation_token: Optional token for pagination
        session: Optional httpx client for connection reuse

    Returns:
        Search results JSON response

    Raises:
        httpx.HTTPStatusError: Request failed
        httpx.RequestError: Network error
    """
    url = "https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false"

    # Build search request payload
    payload = {
        "context": {
            "client": {
                "hl": language_code,
                "remoteHost": "112.134.244.81",
                "deviceMake": "",
                "deviceModel": "",
                "userAgent": ua_generator.generate().text,  # Randomize user agent
                "clientName": "WEB",
                "clientVersion": "2.20250304.01.00",
                "osName": "X11",
                "osVersion": "",
                "originalUrl": f"https://www.youtube.com/results?search_query={query}",
                "platform": "DESKTOP",
                "clientFormFactor": "UNKNOWN_FORM_FACTOR",
                "userInterfaceTheme": "USER_INTERFACE_THEME_DARK",
                "browserName": "Firefox",
                "browserVersion": "136.0",
                "acceptHeader": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
            "user": {"lockedSafetyMode": False},
            "request": {
                "useSsl": True,
                "internalExperimentFlags": [],
                "consistencyTokenJars": [],
            },
        },
        "query": query,
    }
    if continuation_token:
        payload["continuation"] = continuation_token

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0",
        "Accept": "*/*",
        "Accept-Language": "en-GB,en;q=0.7,en-US;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/json",
        "Referer": "https://www.youtube.com/results?search_query=data",
        "Origin": "https://www.youtube.com",
        "DNT": "1",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "same-origin",
        "Sec-Fetch-Site": "same-origin",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
        "TE": "trailers",
    }

    if session:
        response = session.post(
            url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    limits = httpx.Limits(max_keepalive_connections=100, max_connections=200)
    timeout = httpx.Timeout(10.0, connect=10.0)
    with httpx.Client(limits=limits, timeout=timeout) as local_session:
        response = local_session.post(url, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()


def handle_get_serp(
    query: str,
    language_code: str,
    continuation_token: str | None = None,
    session: httpx.Client | None = None,
) -> dict:
    """Execute search request with retry logic.

    Args:
        query: Search query string
        language_code: Language code for localization
        continuation_token: Optional pagination token
        session: Optional httpx client

    Returns:
        Search results JSON response

    Raises:
        httpx.ReadTimeout, httpx.ConnectTimeout: Connection issues
        ValueError: Unable to fetch after retries
    """
    retries = 0
    while retries < INITIAL_RETRIES:
        try:
            return get_serp(
                query=query,
                language_code=language_code,
                continuation_token=continuation_token,
                session=session,
            )
        except (httpx.ReadTimeout, httpx.ConnectTimeout):
            raise
        except httpx.HTTPStatusError as e:
            LOGGER.error(
                f"QUERY: {query}. HTTP ERROR {e.response.status_code}:{e.response.text}"
            )
            break
        except Exception as e:
            LOGGER.exception(f"QUERY: {query}. {e}", stacklevel=5)
            time.sleep(RETRY_DELAY_FIXED)
            retries += 1
    raise ValueError("Unable to fetch SERP")
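

# Quick single-page smoke test (a usage sketch, not part of the CLI flow;
# it combines the three helpers defined above):
#
#   serp = handle_get_serp("machine learning tutorial", language_code="en")
#   first_page = parse_serp(serp)
#   token = extract_serp_continuation_token(serp)  # pass back in for page 2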


def handle_get_serps(
    query: str, language_code: str = "en", max_results: int = 30
) -> list[YTSearchResult]:
    """Fetch paginated YouTube search results.

    Args:
        query: Search query string
        language_code: Language code (default 'en')
        max_results: Maximum results to return (default 30)

    Returns:
        List of search result objects, up to max_results

    Notes:
        Handles pagination automatically
        Stops at max_results or when no more pages available
    """
    LOGGER.debug(f"Processing '{query}' for {language_code}")
    serp_items: list[YTSearchResult] = []
    continuation_token: str | None = None
    iterations = 0

    with httpx.Client() as session:
        while iterations < MAX_ITERATIONS:
            if len(serp_items) >= max_results:
                break
            iterations += 1
            try:
                serp = handle_get_serp(
                    query=query,
                    language_code=language_code,
                    continuation_token=continuation_token,
                    session=session,
                )
                page_results = parse_serp(serp)
                if not page_results:
                    break
                serp_items.extend(page_results)
                continuation_token = extract_serp_continuation_token(serp)
            except ValueError:
                break
            except (httpx.ReadTimeout, httpx.ConnectTimeout):
                continue
            except Exception:
                LOGGER.exception(
                    f"QUERY: {query}. Error processing SERP: ", stacklevel=5
                )
                break

    return serp_items[:max_results]


def main(argv: list | None = None) -> int:
    """CLI entrypoint for YouTube search.

    Args:
        argv: Optional command-line arguments

    Returns:
        Exit code (0 for success, 1 for error)
    """
    parser = argparse.ArgumentParser(
        description="Search YouTube and return structured results as JSON",
        epilog="Examples:\n"
        "  %(prog)s 'machine learning tutorial' --max-results 10\n"
        "  %(prog)s 'Andrej Karpathy' --lang en\n"
        "  %(prog)s 'Veritasium physics' -n 50",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "query", help="Search query string (e.g., 'machine learning', 'Veritasium')"
    )
    parser.add_argument(
        "--lang",
        "-l",
        dest="lang",
        default="en",
        help="Language code for results (e.g., en, es, fr, de) (default: en)",
    )
    parser.add_argument(
        "--max-results",
        "-n",
        dest="max_results",
        type=int,
        default=30,
        help="Maximum number of results to return (default: 30)",
    )
    args = parser.parse_args(argv)

    try:
        results = handle_get_serps(
            args.query, language_code=args.lang, max_results=args.max_results
        )
        out = json.dumps(
            [r.model_dump() for r in results], ensure_ascii=False, default=str
        )
        print(out, file=sys.stdout)
        return 0
    except Exception as e:
        print(f"Error fetching SERPs for query '{args.query}': {e}", file=sys.stderr)
        return 1


if __name__ == "__main__":
    sys.exit(main())
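

# Using this module as a library rather than through the CLI (a sketch;
# assumes the file is importable as `search` on your PYTHONPATH):
#
#   from search import handle_get_serps
#
#   results = handle_get_serps("Andrej Karpathy", language_code="en", max_results=5)
#   for r in results:
#       print(r.video_id, r.video_title, r.channel_name)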
