async def download_pdf(self, pdf_url, download_dir=None, username=None, password=None, skip_if_exists=True, max_cache_age_days=PDF_CACHE_MAX_AGE_DAYS):
    """
    Download a PDF file, logging in automatically if the server asks for it.

    The file name is taken from the path component of ``pdf_url`` (query
    string and fragment are ignored); if that name does not end in ``.pdf``
    the fallback ``nccn_guideline.pdf`` is used. When ``skip_if_exists`` is
    true and ``check_pdf_cache_age`` reports an existing, still-valid file
    at the target path, nothing is downloaded.

    A login is attempted at most once per call. If the server still answers
    with a login page/redirect after a successful login, the download is
    reported as failed instead of retrying forever.

    Args:
        pdf_url (str): URL of the PDF file.
        download_dir (str, optional): Directory to save the PDF; created if
            missing. Defaults to the current working directory.
        username (str, optional): Username (email address). Falls back to
            ``self.username`` when omitted.
        password (str, optional): Password. Falls back to ``self.password``
            when omitted.
        skip_if_exists (bool): Whether to skip the download if a valid cached
            file already exists. Defaults to True.
        max_cache_age_days (int): Maximum cache file validity period (days).
            Defaults to PDF_CACHE_MAX_AGE_DAYS.

    Returns:
        tuple: ``(success (bool), saved_filename (str))``. ``saved_filename``
        is the base file name only, not the full path. Errors are logged and
        reported as ``(False, filename)``; no exception is propagated.
    """
    # Imported locally because this block cannot rely on the module's
    # top-level imports providing these names.
    import asyncio
    from urllib.parse import urlsplit

    # Bound before the try so the except handler can always return a name,
    # even when deriving it from pdf_url fails.
    filename = 'nccn_guideline.pdf'
    try:
        # Use only the URL path so "?query" / "#fragment" suffixes neither
        # end up in the name nor force unrelated URLs onto the fallback name.
        url_name = os.path.basename(urlsplit(pdf_url).path)
        if url_name.endswith('.pdf'):
            filename = url_name

        if download_dir:
            os.makedirs(download_dir, exist_ok=True)
        else:
            download_dir = os.getcwd()  # Use current working directory if not specified
        save_path = os.path.join(download_dir, filename)

        # Reuse the cached copy when it exists and is still considered valid.
        if skip_if_exists:
            cache_info = check_pdf_cache_age(save_path, max_cache_age_days)
            if cache_info['exists']:
                if cache_info['is_valid']:
                    logger.info(f"Using valid cached PDF: {save_path}")
                    logger.info(f"File size: {cache_info['size']} bytes, age: {cache_info['age_days']} days")
                    return True, filename
                logger.info(f"PDF cache expired ({cache_info['age_days']} days > {max_cache_age_days} days) or corrupted, re-downloading...")
            else:
                logger.info(f"PDF not found in cache, downloading: {save_path}")

        pdf_headers = {
            'Accept': 'application/pdf,*/*',
            'Referer': 'https://www.nccn.org/',
        }

        # One pass fetches the PDF; a second pass happens only after a
        # successful login. The flag bounds the retries to exactly one.
        login_attempted = False
        while True:
            logger.info(f"Downloading PDF: {pdf_url}")
            response = await self.session.get(pdf_url, headers=pdf_headers, follow_redirects=True)
            logger.info(f"Response status: {response.status_code}")
            logger.info(f"Final URL: {response.url}")

            if response.status_code == 200:
                content_type = response.headers.get('Content-Type', '')
                logger.info(f"Content-Type: {content_type}")

                if 'application/pdf' in content_type:
                    with open(save_path, 'wb') as f:
                        f.write(response.content)
                    file_size = os.path.getsize(save_path)
                    logger.info(f"PDF file saved to: {save_path}")
                    logger.info(f"File size: {file_size} bytes")
                    return True, filename

                if 'text/html' not in content_type:
                    logger.warning(f"Unexpected content type: {content_type}")
                    return False, filename

                # An HTML answer to a PDF request is treated as a login page
                # only if its text mentions logging in.
                response_text = response.text
                lowered_text = response_text.lower()
                if 'login' not in lowered_text and 'log in' not in lowered_text:
                    logger.warning("Received HTML response but no login form detected.")
                    logger.debug(f"Response preview: {response_text[:500]}...")
                    return False, filename
                logger.info("Login required detected, attempting automatic login...")

            elif response.status_code == 302:
                # Redirects are normally followed by the client; this covers
                # a 302 that still surfaces to the caller.
                redirect_url = response.headers.get('Location')
                logger.info(f"Received redirect to: {redirect_url}")
                if not (redirect_url and 'login' in redirect_url.lower()):
                    logger.error(f"Unexpected redirect to: {redirect_url}")
                    return False, filename
                logger.info("Redirected to login page, attempting automatic login...")

            else:
                logger.error(f"Download failed, status code: {response.status_code}")
                return False, filename

            # Reaching this point means the server is asking for a login.
            if login_attempted:
                logger.error("Login still required after a successful login, giving up.")
                return False, filename

            login_username = username or self.username
            login_password = password or self.password
            if not (login_username and login_password):
                logger.error("Login required but username and password not provided.")
                return False, filename

            if not await self.login(login_username, login_password, pdf_url):
                logger.error("Automatic login failed.")
                return False, filename

            login_attempted = True
            logger.info("Login successful, re-downloading PDF...")
            # Give the login state a moment to settle without blocking the
            # event loop (time.sleep would stall every other coroutine).
            await asyncio.sleep(1)
    except Exception as e:
        logger.error(f"An error occurred during download: {str(e)}")
        return False, filename