web_scrap.py
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


def extract_article(url):
    """Fetch a page with browser-like headers and return its article text."""
    headers = {
        "User-Agent": UserAgent().random,  # Rotate the UA string on each call
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    session = requests.Session()
    last_error = None
    for attempt in range(3):  # Try up to 3 times
        try:
            response = session.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise on 4xx/5xx status codes

            # Parse the HTML content with BeautifulSoup
            soup = BeautifulSoup(response.content, "html.parser")

            # Prefer the <article> element; fall back to the whole page
            article = soup.find("article")
            content = article.get_text() if article else soup.get_text()
            return content.strip()  # Trim surrounding whitespace
        except requests.RequestException as e:
            last_error = e
            print(f"Request failed: {e}")
            if attempt < 2:
                time.sleep(random.uniform(1, 3))  # Wait before retrying
    raise RuntimeError("Failed to extract article after 3 attempts") from last_error
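

# A minimal usage sketch when the module is run directly. The URL below is a
# hypothetical placeholder, not part of the original script; substitute a page
# you are permitted to scrape.
if __name__ == "__main__":
    example_url = "https://example.com/some-article"  # placeholder URL
    try:
        text = extract_article(example_url)
        print(text[:500])  # Preview the first 500 characters of extracted text
    except RuntimeError as e:
        print(f"Extraction failed: {e}")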