# brightdata_html_search.py
import streamlit as st
import ssl
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from io import BytesIO
from fpdf import FPDF
# ========== PROXY CONFIGURATION ==========
# SECURITY(review): the Bright Data username/password is hard-coded in source.
# Move it to an environment variable or st.secrets before sharing/deploying.
PROXY = 'http://brd-customer-hl_c103e9b9-zone-serp_api1:prz02bbteqnk@brd.superproxy.io:33335'
# Browser-like request headers so Google serves a normal HTML results page;
# Accept-Language prefers Brazilian Portuguese.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0',
    'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}
# ========== FUNÇÃO DE BUSCA ==========
def proxy_search_google(query: str) -> bytes:
    """Fetch the Google results page for *query* through the Bright Data proxy.

    Returns the raw HTML response body as bytes.
    Raises urllib.error.URLError / HTTPError on network or proxy failure.
    """
    # Scoped unverified-SSL context (the proxy intercepts TLS) attached to this
    # opener only, instead of monkey-patching ssl._create_default_https_context
    # and silently disabling certificate checks for the entire process.
    ctx = ssl._create_unverified_context()
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({'http': PROXY, 'https': PROXY}),
        urllib.request.HTTPSHandler(context=ctx),
    )
    opener.addheaders = list(HEADERS.items())
    encoded_query = urllib.parse.quote(query)
    url = f'https://www.google.com/search?q={encoded_query}&hl=pt'
    # Context manager guarantees the socket is closed even if read() fails;
    # the original leaked the connection.
    with opener.open(url) as response:
        return response.read()
# ========== PARSER DO HTML ==========
def extract_results_from_html(html):
    """Parse a Google SERP page into a DataFrame with title/link/snippet columns.

    Results missing either a title or a link are skipped; a missing snippet
    becomes an empty string. Returns an empty DataFrame when nothing matches.
    """
    soup = BeautifulSoup(html, 'html.parser')
    rows = []
    for hit in soup.select('div.g'):
        title = hit.select_one('h3')
        anchor = hit.select_one('a[href]')
        # Guard clause: both a heading and a target URL are required.
        if title is None or anchor is None:
            continue
        # Google has used both of these snippet classes over time.
        snippet = hit.select_one('.VwiC3b') or hit.select_one('.IsZvec')
        rows.append({
            "title": title.get_text(),
            "link": anchor['href'],
            "snippet": snippet.get_text() if snippet else "",
        })
    return pd.DataFrame(rows)
# ========== GRÁFICO DE TENDÊNCIA ==========
def plot_trend(query):
    """Render a simulated 6-month interest trend chart for *query*.

    Returns a BytesIO positioned at 0 containing the chart as a PNG.
    NOTE: the data points are hard-coded placeholders, not real trend data.
    """
    trend_data = pd.DataFrame({
        # NOTE(review): freq='M' is deprecated in pandas >= 2.2 (renamed 'ME');
        # kept as-is for compatibility with older pandas versions.
        "Month": pd.date_range(start="2024-01-01", periods=6, freq='M'),
        "Interest": [20, 35, 55, 70, 85, 100]
    })
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(trend_data["Month"], trend_data["Interest"], marker='o')
    ax.set_title(f"Tendência de Interesse: {query}")
    ax.set_xlabel("Mês")
    ax.set_ylabel("Interesse")
    ax.grid(True)
    fig.tight_layout()  # operate on this figure explicitly, not pyplot's "current" one
    buf = BytesIO()
    fig.savefig(buf, format="png")
    # Close the figure so repeated Streamlit reruns don't accumulate open
    # figures and leak memory (the original never closed it).
    plt.close(fig)
    buf.seek(0)
    return buf
# ========== PDF ==========
class PDF(FPDF):
    """PDF report: a centered title repeated per page, one paragraph per result."""

    def header(self):
        # Title bar rendered automatically at the top of every page.
        self.set_font("Arial", "B", 12)
        self.cell(0, 10, "Resultados da Pesquisa", ln=True, align="C")

    def chapter_body(self, df):
        # One numbered title / link / snippet paragraph per DataFrame row.
        self.set_font("Arial", "", 10)
        for idx, row in df.iterrows():
            paragraph = f"{idx+1}. {row['title']}\n{row['link']}\n{row['snippet']}\n"
            self.multi_cell(0, 8, paragraph, border=0)
            self.ln(2)

    def create_pdf(self, df):
        """Start a page and write the full result listing."""
        self.add_page()
        self.chapter_body(df)
def generate_pdf(df):
    """Build the results PDF from *df*; returns it as a BytesIO positioned at 0.

    FPDF's built-in fonts (Arial) only encode latin-1, so every string cell is
    sanitized first: common smart quotes become ASCII, and any remaining
    non-latin-1 character is replaced instead of raising UnicodeEncodeError
    inside FPDF (the original only handled the three smart-quote characters).
    """
    def _latin1_safe(value):
        # Only string cells need sanitizing; other dtypes pass through.
        if isinstance(value, str):
            value = (value.replace("\u2019", "'")
                          .replace("\u201c", '"')
                          .replace("\u201d", '"'))
            return value.encode("latin-1", "replace").decode("latin-1")
        return value

    # Series.map per column: stable across pandas versions (applymap/DataFrame.map
    # availability varies by version).
    clean_df = df.apply(lambda col: col.map(_latin1_safe))
    pdf = PDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.create_pdf(clean_df)
    buf = BytesIO()
    pdf.output(buf)
    buf.seek(0)
    return buf
# ========== STREAMLIT UI ==========
# Page chrome and the single search input.
st.set_page_config(page_title="Google via Proxy Bright Data", layout="wide")
st.title("🔎 Pesquisa no Google via Proxy (Bright Data)")
query = st.text_input("Digite o termo de busca", "geoai")
if st.button("Pesquisar"):
    with st.spinner("Buscando dados via proxy Bright Data..."):
        try:
            html = proxy_search_google(query)
            df_links = extract_results_from_html(html)
            if df_links.empty:
                st.warning("Nenhum resultado encontrado.")
                with st.expander("🔍 Ver HTML bruto retornado"):
                    # `html` is bytes: decode for display so the preview shows
                    # readable HTML rather than a bytes repr, and never crashes
                    # on an unexpected encoding.
                    st.code(html[:3000].decode("utf-8", errors="replace"), language="html")
            else:
                st.success(f"{len(df_links)} resultados encontrados.")
                st.dataframe(df_links, use_container_width=True)
                st.subheader("📈 Tendência do Tema")
                trend_buf = plot_trend(query)
                st.image(trend_buf, caption="Gráfico simulado de interesse")
                st.subheader("📥 Baixar dados")
                csv_buf = BytesIO()
                df_links.to_csv(csv_buf, index=False)
                st.download_button("⬇️ CSV", data=csv_buf.getvalue(), file_name="links.csv", mime="text/csv")
                # to_excel needs an engine (openpyxl/xlsxwriter); if none is
                # installed it raises ImportError — skip just this button
                # instead of aborting the whole results section.
                try:
                    excel_buf = BytesIO()
                    df_links.to_excel(excel_buf, index=False)
                    st.download_button("⬇️ Excel", data=excel_buf.getvalue(), file_name="links.xlsx",
                                       mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
                except ImportError as exc:
                    st.info(f"Exportação Excel indisponível: {exc}")
                st.download_button("⬇️ PNG (Gráfico)", data=trend_buf.getvalue(), file_name="trend.png", mime="image/png")
                pdf_buf = generate_pdf(df_links)
                st.download_button("⬇️ PDF", data=pdf_buf.getvalue(), file_name="links.pdf", mime="application/pdf")
        except Exception as e:
            # Broad catch is acceptable at this top-level UI boundary: surface
            # the error to the user instead of a Streamlit traceback.
            st.error(f"Erro ao consultar via proxy: {e}")