download_pdf
Download PDF files from URLs with optional NCCN authentication for accessing clinical guidelines.
Instructions
Download a PDF file from the specified URL, with optional NCCN login credentials.
Args:
url: The URL of the PDF file to download
filename: Not exposed as a tool parameter; the saved filename is derived automatically from the URL
username: Not exposed as a tool parameter; the NCCN username/email is taken from the NCCN_USERNAME env var
password: Not exposed as a tool parameter; the NCCN password is taken from the NCCN_PASSWORD env var
Returns:
String indicating success/failure and the path to the downloaded file
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | The URL of the PDF file to download | |
Implementation Reference
# server.py:167-223 (handler) - MCP tool handler for 'download_pdf': orchestrates
# authentication, directory setup, and delegates the download to an NCCNDownloader instance.
@mcp.tool()
async def download_pdf(url: str) -> str:
    """
    Download a PDF file from the specified URL, using NCCN credentials when configured.

    The tool takes only the URL. Credentials are not tool parameters: they are read
    from the module-level NCCN_USERNAME / NCCN_PASSWORD configuration. The saved
    filename is whatever the downloader reports back.

    Args:
        url: The URL of the PDF file to download

    Returns:
        String indicating success/failure and the path to the downloaded file.
        Failures are reported in the returned string rather than raised.
    """
    try:
        # Ensure download directory exists (parents=True so a nested DOWNLOAD_DIR also works)
        download_path = current_dir / DOWNLOAD_DIR
        download_path.mkdir(parents=True, exist_ok=True)

        # Credentials come from the global configuration only
        auth_username = NCCN_USERNAME
        auth_password = NCCN_PASSWORD
        has_credentials = bool(auth_username and auth_password)

        # Create downloader instance with credentials if available
        # NOTE(review): a fresh NCCNDownloader is built on every authenticated call, so any
        # login session it holds is presumably not reused between calls - confirm.
        if has_credentials:
            downloader_instance = NCCNDownloader(auth_username, auth_password)
            logger.info(f"Using NCCN authentication for user: {auth_username}")
        else:
            downloader_instance = downloader
            logger.info("No NCCN authentication configured - attempting anonymous download")

        # Download the PDF
        success, actual_filename = await downloader_instance.download_pdf(
            pdf_url=url,
            download_dir=str(download_path),
            username=auth_username,
            password=auth_password,
            skip_if_exists=True
        )

        # Build the full path from the filename the downloader actually used
        actual_full_path = download_path / actual_filename

        if success:
            logger.info(f"PDF downloaded successfully: {actual_full_path}")
            return f"PDF downloaded successfully: {actual_full_path} (filename: {actual_filename})"

        error_msg = f"Failed to download PDF from {url} (attempted filename: {actual_filename})."
        if not has_credentials:
            # Only environment configuration is supported; this tool has no credential parameters.
            error_msg += " You may need to provide NCCN login credentials via environment variables (NCCN_USERNAME, NCCN_PASSWORD)."
        logger.error(error_msg)
        return error_msg

    except Exception as e:
        # Tool boundary: report the failure as text instead of raising, but keep the traceback in the log.
        logger.exception(f"Error downloading PDF: {str(e)}")
        return f"Error downloading PDF: {str(e)}"
# nccn_login_downloader.py:204-340 (helper) - Core implementation of PDF downloading in the
# NCCNDownloader class: caching check, HTTP request, automatic login detection and handling,
# and a single re-download attempt after a successful login.
async def download_pdf(self, pdf_url, download_dir=None, username=None, password=None,
                       skip_if_exists=True, max_cache_age_days=PDF_CACHE_MAX_AGE_DAYS,
                       *, _allow_login=True):
    """
    Downloads a PDF file, automatically logging in if required.

    Args:
        pdf_url (str): URL of the PDF file.
        download_dir (str, optional): Directory to save the PDF. Defaults to current directory.
        username (str, optional): Username (email address); falls back to self.username.
        password (str, optional): Password; falls back to self.password.
        skip_if_exists (bool): Whether to skip download if the file already exists. Defaults to True.
        max_cache_age_days (int): Maximum cache file validity period (days). Defaults to PDF_CACHE_MAX_AGE_DAYS.
        _allow_login (bool): Internal. False on the retry issued after a successful login, so a
            server that keeps answering with a login page cannot cause unbounded recursion.

    Returns:
        tuple: (success (bool), saved_filename (str)). Errors are logged and reported as
        (False, filename); no exception is propagated to the caller.
    """
    import asyncio
    from urllib.parse import urlsplit

    # Bound before the try block so the except handler can always return it.
    filename = 'nccn_guideline.pdf'
    try:
        # Extract the filename from the URL *path* only, so a query string or fragment
        # (e.g. ".../breast.pdf?v=2") does not push distinct documents onto the generic name.
        url_basename = os.path.basename(urlsplit(pdf_url).path)
        if url_basename.endswith('.pdf'):
            filename = url_basename

        if download_dir:
            os.makedirs(download_dir, exist_ok=True)
        else:
            download_dir = os.getcwd()  # Use current working directory if not specified

        save_path = os.path.join(download_dir, filename)

        # Check if file already exists and is still valid (not too old)
        if skip_if_exists:
            cache_info = check_pdf_cache_age(save_path, max_cache_age_days)
            if cache_info['exists']:
                if cache_info['is_valid']:
                    logger.info(f"Using valid cached PDF: {save_path}")
                    logger.info(f"File size: {cache_info['size']} bytes, age: {cache_info['age_days']} days")
                    return True, filename
                logger.info(f"PDF cache expired ({cache_info['age_days']} days > {max_cache_age_days} days) or corrupted, re-downloading...")
            else:
                logger.info(f"PDF not found in cache, downloading: {save_path}")

        logger.info(f"Downloading PDF: {pdf_url}")

        # Set request headers for PDF download
        pdf_headers = {
            'Accept': 'application/pdf,*/*',
            'Referer': 'https://www.nccn.org/',
        }

        response = await self.session.get(pdf_url, headers=pdf_headers, follow_redirects=True)

        logger.info(f"Response status: {response.status_code}")
        logger.info(f"Final URL: {response.url}")

        # Every branch below either returns or falls through to the shared login-and-retry step.
        if response.status_code == 200:
            content_type = response.headers.get('Content-Type', '')
            logger.info(f"Content-Type: {content_type}")

            if 'application/pdf' in content_type:
                # This is a PDF, save it directly
                with open(save_path, 'wb') as f:
                    f.write(response.content)

                file_size = os.path.getsize(save_path)
                logger.info(f"PDF file saved to: {save_path}")
                logger.info(f"File size: {file_size} bytes")
                return True, filename

            if 'text/html' not in content_type:
                logger.warning(f"Unexpected content type: {content_type}")
                return False, filename

            # HTML instead of a PDF: likely a login page
            response_text = response.text
            lowered_text = response_text.lower()
            if 'login' not in lowered_text and 'log in' not in lowered_text:
                logger.warning("Received HTML response but no login form detected.")
                logger.debug(f"Response preview: {response_text[:500]}...")
                return False, filename

            logger.info("Login required detected, attempting automatic login...")

        elif response.status_code == 302:
            # NOTE(review): the request asks for follow_redirects=True, so this branch is
            # presumably only a fallback for a client that does not follow redirects - confirm.
            redirect_url = response.headers.get('Location')
            logger.info(f"Received redirect to: {redirect_url}")

            if not (redirect_url and 'login' in redirect_url.lower()):
                logger.error(f"Unexpected redirect to: {redirect_url}")
                return False, filename

            logger.info("Redirected to login page, attempting automatic login...")

        else:
            logger.error(f"Download failed, status code: {response.status_code}")
            return False, filename

        # --- Login required ---
        if not _allow_login:
            # We already logged in once for this download and were sent back to a login page.
            # Instance credentials (self.username / self.password) would otherwise let this recurse forever.
            logger.error("Login page returned again after a successful login; giving up.")
            return False, filename

        login_username = username or self.username
        login_password = password or self.password
        if not (login_username and login_password):
            logger.error("Login required but username and password not provided.")
            return False, filename

        if not await self.login(login_username, login_password, pdf_url):
            logger.error("Automatic login failed.")
            return False, filename

        logger.info("Login successful, re-downloading PDF...")
        # Wait for login state to stabilize without blocking the event loop.
        await asyncio.sleep(1)
        return await self.download_pdf(
            pdf_url,
            download_dir=download_dir,
            skip_if_exists=skip_if_exists,
            max_cache_age_days=max_cache_age_days,
            _allow_login=False,
        )

    except Exception as e:
        logger.error(f"An error occurred during download: {str(e)}")
        return False, filename
- server.py:167 (registration): The `@mcp.tool()` decorator registers the download_pdf function as an MCP tool. `@mcp.tool()`