"""
Company researcher for generating basic company context using Gemma3.
This module provides basic company research capabilities to generate
context and insights for case study generation.
"""
import logging
import re
import os
from typing import Dict, Any, List, Optional
from dotenv import load_dotenv
from gemma3_client import get_gemma3_client, Gemma3ClientError
from prompts import get_company_research_prompt, get_analysis_config
# Load environment variables from .env file.
# This runs at import time so that later os.getenv() lookups in this module
# (e.g. TAVILY_API_KEY in CompanyResearcher.__init__) can see those values.
load_dotenv()
# Tavily is an optional dependency: when the package is missing the module
# still imports, TavilyClient is bound to None and TAVILY_AVAILABLE is False.
try:
    from tavily import TavilyClient
except ImportError:
    TavilyClient = None
    TAVILY_AVAILABLE = False
else:
    TAVILY_AVAILABLE = True
# Configure logging
# NOTE(review): basicConfig() runs as a side effect of importing this module
# and configures the root logger at INFO level; confirm that is intended for
# code that is imported by a larger application.
logging.basicConfig(level=logging.INFO)
# Module-level logger used throughout CompanyResearcher.
logger = logging.getLogger(__name__)
class CompanyResearcherError(Exception):
    """Root exception type raised by the company researcher."""
class CompanyResearcher:
    """
    Generates basic company research and context using Gemma3.

    Provides industry-standard insights, challenges, and opportunities
    based on company name and available context information. When a Tavily
    client could be created, web search results are folded into the context
    that is sent to Gemma3; otherwise only the keyword heuristics defined in
    this class are used to enrich the context.
    """

    # Keywords up to this length ("ai", "ml", "app", "oil", "gas") are matched
    # as whole words only. A plain substring test would find "ai" inside
    # "retail", "training" or "entertainment" and skew the detection.
    _WHOLE_WORD_MAX_LEN = 3

    def __init__(self, gemma3_client=None, tavily_api_key=None):
        """
        Initialize company researcher.

        Args:
            gemma3_client: Optional Gemma3Client instance. If None, uses global client.
            tavily_api_key: Optional Tavily API key. If None, uses environment variable.
        """
        self.gemma3 = gemma3_client or get_gemma3_client()

        # Tavily client for web search; stays None when search is unavailable.
        self.tavily_client = self._build_tavily_client(tavily_api_key)

        # Common industry patterns for context enhancement
        self.industry_keywords = {
            "Technology": ["tech", "software", "app", "platform", "digital", "ai", "ml", "cloud", "saas"],
            "Healthcare": ["health", "medical", "pharma", "biotech", "clinic", "hospital", "therapeutics"],
            "Finance": ["bank", "fintech", "finance", "financial", "payment", "insurance", "investment"],
            "Retail": ["retail", "ecommerce", "shop", "store", "marketplace", "consumer", "brand"],
            "Manufacturing": ["manufacturing", "industrial", "factory", "production", "automotive"],
            "Energy": ["energy", "oil", "gas", "renewable", "solar", "wind", "power", "utilities"],
            "Education": ["education", "learning", "school", "university", "training", "edtech"],
            "Media": ["media", "entertainment", "content", "streaming", "publishing", "news"],
            "Transportation": ["logistics", "shipping", "delivery", "transport", "mobility"],
            "Real Estate": ["real estate", "property", "construction", "housing", "commercial"],
        }

        # Common business model patterns
        self.business_model_indicators = {
            "SaaS": ["software", "platform", "subscription", "cloud", "service"],
            "E-commerce": ["online", "marketplace", "retail", "shop", "store"],
            "Marketplace": ["marketplace", "platform", "connect", "match"],
            "Consulting": ["consulting", "advisory", "services", "professional"],
            "Manufacturing": ["manufacturing", "production", "factory", "industrial"],
            "Media": ["content", "media", "publishing", "advertising"],
        }

    def _build_tavily_client(self, tavily_api_key=None):
        """
        Create the Tavily client, or return None when search cannot be used.

        Args:
            tavily_api_key: Explicit API key; falls back to the TAVILY_API_KEY
                environment variable when empty.

        Returns:
            A TavilyClient instance, or None if the package is missing, no key
            is available, or the client constructor raised.
        """
        if not TAVILY_AVAILABLE:
            logger.warning("Tavily not installed. Install with: pip install tavily-python")
            return None

        api_key = tavily_api_key or os.getenv("TAVILY_API_KEY")
        if not api_key:
            logger.warning("Tavily API key not found. Company research will use LLM patterns only.")
            return None

        try:
            client = TavilyClient(api_key=api_key)
        except Exception as e:
            # Search is optional, so a broken client must not prevent construction.
            logger.warning(f"Failed to initialize Tavily client: {e}")
            return None

        logger.info("Tavily client initialized for web research")
        return client

    def _count_keyword_hits(self, search_text: str, keywords: List[str]) -> int:
        """
        Count how many of the keywords occur in the (lower-cased) search text.

        Short keywords must appear as whole words; longer ones may appear
        anywhere, so compound names such as "techcorp" still match "tech".
        """
        hits = 0
        for keyword in keywords:
            if len(keyword) <= self._WHOLE_WORD_MAX_LEN:
                found = re.search(rf"\b{re.escape(keyword)}\b", search_text) is not None
            else:
                found = keyword in search_text
            if found:
                hits += 1
        return hits

    def _best_keyword_match(
        self,
        keyword_table: Dict[str, List[str]],
        company_name: str,
        company_context: str,
        default: str,
    ) -> str:
        """Return the label in keyword_table with the most keyword hits, or default."""
        search_text = f"{company_name} {company_context}".lower()
        best_label, best_score = default, 0
        for label, keywords in keyword_table.items():
            score = self._count_keyword_hits(search_text, keywords)
            # Strict comparison keeps the first-listed label when scores tie.
            if score > best_score:
                best_label, best_score = label, score
        return best_label

    def _detect_industry(self, company_name: str, company_context: str = "") -> str:
        """
        Detect likely industry based on company name and context.

        Args:
            company_name: Name of the company
            company_context: Additional context about the company

        Returns:
            Detected industry or "General Business"
        """
        return self._best_keyword_match(
            self.industry_keywords, company_name, company_context, "General Business"
        )

    def _detect_business_model(self, company_name: str, company_context: str = "") -> str:
        """
        Detect likely business model based on company name and context.

        Args:
            company_name: Name of the company
            company_context: Additional context about the company

        Returns:
            Detected business model or "Traditional Business"
        """
        return self._best_keyword_match(
            self.business_model_indicators, company_name, company_context, "Traditional Business"
        )

    def _validate_company_name(self, company_name: str) -> str:
        """
        Validate and clean company name.

        Args:
            company_name: Raw company name

        Returns:
            Cleaned company name (trimmed, inner whitespace runs collapsed)

        Raises:
            CompanyResearcherError: If company name is invalid
        """
        if not company_name or not isinstance(company_name, str):
            raise CompanyResearcherError("Company name is required and must be a string")

        cleaned = company_name.strip()
        if not cleaned:
            raise CompanyResearcherError("Company name cannot be empty")

        # Remove excessive whitespace
        cleaned = re.sub(r'\s+', ' ', cleaned)

        # Basic validation - should contain at least some letters. str.isalpha
        # is used so that names written in non-Latin scripts are accepted too.
        if not any(ch.isalpha() for ch in cleaned):
            raise CompanyResearcherError("Company name must contain letters")

        return cleaned

    def _validate_research_response(self, response: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate and sanitize Gemma3 research response.

        Args:
            response: Raw response from Gemma3

        Returns:
            Validated and structured response. A fallback structure is
            returned when the response is not a dict or carries an "error" key.
        """
        if not isinstance(response, dict):
            logger.warning(
                f"Gemma3 research error: unexpected response type {type(response).__name__}"
            )
            return self._create_fallback_research(response)

        # Check for processing errors
        if "error" in response:
            logger.warning(f"Gemma3 research error: {response.get('error')}")
            return self._create_fallback_research(response.get("raw_response", ""))

        text_fields = ("company_profile", "industry", "business_model")
        list_fields = ("challenges", "opportunities", "technology_needs")

        validated: Dict[str, Any] = {}

        # Ensure strings are strings; falsy non-string values collapse to "".
        for field in text_fields:
            value = response.get(field, "")
            if not isinstance(value, str):
                value = str(value) if value else ""
            validated[field] = value

        # Ensure lists are actually lists of non-empty strings.
        for field in list_fields:
            value = response.get(field, [])
            if isinstance(value, str) and value:
                value = [value]
            elif not isinstance(value, list):
                value = []
            validated[field] = [item for item in value if item and isinstance(item, str)]

        return validated

    def _create_fallback_research(self, raw_response: str) -> Dict[str, Any]:
        """
        Create a fallback research response when Gemma3 processing fails.

        Args:
            raw_response: Original response text. None and non-string values
                are tolerated and converted to text for the sample.

        Returns:
            Basic structured research with placeholder content and at most the
            first 200 characters of the raw response.
        """
        if raw_response is None:
            raw_text = ""
        elif isinstance(raw_response, str):
            raw_text = raw_response
        else:
            raw_text = str(raw_response)

        sample = raw_text[:200] + "..." if len(raw_text) > 200 else raw_text

        return {
            "company_profile": "Company research failed - manual review required",
            "industry": "Unknown",
            "business_model": "Unknown",
            "challenges": ["Research processing failed - manual analysis needed"],
            "opportunities": ["Potential opportunities require manual assessment"],
            "technology_needs": ["Technology requirements need manual evaluation"],
            "research_note": "Fallback response due to processing error",
            "raw_response_sample": sample,
        }

    def _search_company_info(self, company_name: str) -> Dict[str, Any]:
        """
        Search for real company information using Tavily.

        Queries are tried in order and the loop stops at the first one that
        yields a summarised answer. A failing query is logged and skipped.

        Args:
            company_name: Name of the company to research

        Returns:
            Search results and extracted information. Always contains
            "search_available"; contains "results_found" unless the search
            itself failed, in which case "search_error" is set instead.
        """
        if not self.tavily_client:
            return {"search_available": False, "fallback_reason": "Tavily not configured"}

        try:
            # Perform web search for company information
            search_queries = [
                f"{company_name} company overview business",
                f"{company_name} products services industry",
                f"{company_name} about company profile",
            ]

            all_results = []
            answer = ""
            for query in search_queries:
                try:
                    response = self.tavily_client.search(
                        query=query,
                        search_depth="basic",
                        max_results=3,
                        include_answer=True,
                        include_raw_content=False,
                    )
                    all_results.extend(response.get("results") or [])
                    # Use the answer if available
                    if response.get("answer"):
                        answer = response["answer"]
                        break
                except Exception as e:
                    logger.warning(f"Search query failed '{query}': {e}")
                    continue

            # An answer without result rows is still usable context.
            if not all_results and not answer:
                return {"search_available": True, "results_found": False}

            # Extract key information from search results
            search_info = {
                "search_available": True,
                "results_found": True,
                "search_summary": answer,
                "sources": [],
                "content_snippets": [],
            }

            for result in all_results[:5]:  # Limit to top 5 results
                search_info["sources"].append({
                    "title": result.get("title", ""),
                    "url": result.get("url", ""),
                    "score": result.get("score", 0),
                })

                content = result.get("content", "")
                if content:
                    # Clean and truncate content
                    clean_content = re.sub(r'\s+', ' ', content).strip()
                    if len(clean_content) > 300:
                        clean_content = clean_content[:300] + "..."
                    search_info["content_snippets"].append(clean_content)

            return search_info

        except Exception as e:
            logger.error(f"Tavily search failed: {e}")
            return {"search_available": True, "search_error": str(e)}

    def _enhance_context(
        self,
        company_name: str,
        company_context: str = "",
        search_info: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Enhance company context with detected patterns and search results.

        Args:
            company_name: Name of the company
            company_context: Existing context
            search_info: Web search results from Tavily

        Returns:
            Enhanced context string; the parts are joined with ". ".
        """
        enhanced_parts = []

        if company_context:
            enhanced_parts.append(company_context)

        # Add search results if available
        if search_info and search_info.get("results_found"):
            if search_info.get("search_summary"):
                enhanced_parts.append(f"Web search summary: {search_info['search_summary']}")
            # Add content snippets
            for snippet in search_info.get("content_snippets", [])[:2]:  # Top 2 snippets
                enhanced_parts.append(f"Company info: {snippet}")

        # Add detected industry (fallback)
        detected_industry = self._detect_industry(company_name, company_context)
        if detected_industry != "General Business":
            enhanced_parts.append(f"Detected industry: {detected_industry}")

        # Add detected business model (fallback)
        detected_model = self._detect_business_model(company_name, company_context)
        if detected_model != "Traditional Business":
            enhanced_parts.append(f"Detected business model: {detected_model}")

        return ". ".join(enhanced_parts) if enhanced_parts else "No additional context available"

    def _prepare_research(self, company_name: str, company_context: str, log_suffix: str = ""):
        """
        Validate the name, run the web search and build the Gemma3 prompt.

        Returns:
            Tuple of (validated_name, enhanced_context, prompt).
        """
        validated_name = self._validate_company_name(company_name)
        logger.info(f"Researching company: {validated_name}{log_suffix}")

        # Perform web search for real company information
        search_info = self._search_company_info(validated_name)

        # Enhance context with search results
        enhanced_context = self._enhance_context(validated_name, company_context, search_info)

        prompt = get_company_research_prompt(validated_name, enhanced_context)
        return validated_name, enhanced_context, prompt

    def _finalize_research(
        self,
        response: Any,
        validated_name: str,
        company_context: str,
        enhanced_context: str,
        log_suffix: str = "",
    ) -> Dict[str, Any]:
        """Validate the Gemma3 response and attach metadata and detected information."""
        validated_response = self._validate_research_response(response)

        # A response that had to be replaced by the fallback structure is not
        # a successful research result and must not be reported as one.
        succeeded = isinstance(response, dict) and "error" not in response

        validated_response.update({
            "company_name": validated_name,
            "provided_context": company_context,
            "enhanced_context": enhanced_context,
            "detected_industry": self._detect_industry(validated_name, company_context),
            "detected_business_model": self._detect_business_model(validated_name, company_context),
            "research_success": succeeded,
        })

        if succeeded:
            logger.info(f"Company research completed successfully{log_suffix}")
        return validated_response

    def _gemma3_failure_fallback(self, company_name: str, error: Exception) -> Dict[str, Any]:
        """Build the fallback result returned when the Gemma3 client raised."""
        logger.error(f"Gemma3 client error: {error}")
        fallback = self._create_fallback_research(str(error))
        fallback["company_name"] = company_name
        fallback["research_success"] = False
        fallback["error_type"] = "gemma3_error"
        return fallback

    def research_company(self, company_name: str, company_context: str = "") -> Dict[str, Any]:
        """
        Generate basic company research and context.

        Args:
            company_name: Name of the company to research
            company_context: Optional additional context about the company

        Returns:
            Structured company research and insights. "research_success" is
            False when Gemma3 raised or returned an error response; in both
            cases the fallback structure is returned instead of raising.

        Raises:
            CompanyResearcherError: If the name is invalid or research fails
                for a reason other than a Gemma3 client error.
        """
        try:
            validated_name, enhanced_context, prompt = self._prepare_research(
                company_name, company_context
            )
            response = self.gemma3.process_with_json(prompt, "company_research")
            return self._finalize_research(
                response, validated_name, company_context, enhanced_context
            )
        except Gemma3ClientError as e:
            # Return fallback response instead of failing
            return self._gemma3_failure_fallback(company_name, e)
        except CompanyResearcherError:
            raise  # Re-raise company researcher specific errors
        except Exception as e:
            logger.error(f"Company research failed: {e}")
            raise CompanyResearcherError(f"Research failed: {e}") from e

    async def research_company_async(self, company_name: str, company_context: str = "") -> Dict[str, Any]:
        """
        Generate basic company research asynchronously.

        Only the Gemma3 call is awaited.
        NOTE(review): the Tavily search in the preparation step is a regular
        synchronous call and therefore runs inline in this coroutine; confirm
        whether it should be moved off the event loop.

        Args:
            company_name: Name of the company to research
            company_context: Optional additional context about the company

        Returns:
            Structured company research and insights (same shape as
            research_company).

        Raises:
            CompanyResearcherError: If the name is invalid or research fails
                for a reason other than a Gemma3 client error.
        """
        try:
            validated_name, enhanced_context, prompt = self._prepare_research(
                company_name, company_context, " [async]"
            )
            response = await self.gemma3.process_with_json_async(prompt, "company_research")
            return self._finalize_research(
                response, validated_name, company_context, enhanced_context, " [async]"
            )
        except Gemma3ClientError as e:
            # Return fallback response instead of failing
            return self._gemma3_failure_fallback(company_name, e)
        except CompanyResearcherError:
            raise  # Re-raise company researcher specific errors
        except Exception as e:
            logger.error(f"Company research failed: {e}")
            raise CompanyResearcherError(f"Research failed: {e}") from e

    def research_multiple_companies(self, companies: List[Dict[str, str]]) -> List[Dict[str, Any]]:
        """
        Research multiple companies one after another.

        A failure for one entry is recorded as an error result and does not
        stop the remaining entries from being processed.

        Args:
            companies: List of companies with 'name' and optional 'context' keys

        Returns:
            List of company research results, each tagged with "company_index"
        """
        results = []

        for i, company in enumerate(companies):
            is_mapping = isinstance(company, dict)
            try:
                if not is_mapping:
                    raise CompanyResearcherError(
                        "Company entry must be a dict with a 'name' key"
                    )
                name = company.get("name", "")
                context = company.get("context", "")

                result = self.research_company(name, context)
                result["company_index"] = i
                results.append(result)

            except Exception as e:
                logger.error(f"Failed to research company {i}: {e}")
                # Add error result
                results.append({
                    "company_index": i,
                    "research_success": False,
                    "error": str(e),
                    "company_name": company.get("name", "Unknown") if is_mapping else "Unknown",
                })

        return results

    def generate_industry_insights(self, industry: str) -> Dict[str, Any]:
        """
        Generate general insights for a specific industry.

        Args:
            industry: Industry name

        Returns:
            Industry-specific insights as returned by Gemma3, tagged with
            "industry" and "insight_type" unless the response carries an
            "error" key. If the call raises, an error dict is returned instead.
        """
        # Use the company research prompt with industry focus
        prompt = f"""
Generate industry insights for: {industry}
Return JSON with this structure:
{{
"industry_overview": "brief description of the industry",
"market_trends": ["current trends affecting this industry"],
"common_challenges": ["typical challenges companies in this industry face"],
"growth_opportunities": ["potential growth areas"],
"technology_trends": ["relevant technology trends"],
"success_factors": ["key factors for success in this industry"]
}}
Focus on actionable business insights.
"""

        try:
            response = self.gemma3.process_with_json(prompt, "company_research")

            # Add metadata
            if "error" not in response:
                response["industry"] = industry
                response["insight_type"] = "industry_analysis"

            return response

        except Exception as e:
            logger.error(f"Industry insights generation failed: {e}")
            return {
                "error": f"Failed to generate industry insights: {e}",
                "industry": industry,
                "insight_type": "industry_analysis",
            }

    def health_check(self) -> Dict[str, Any]:
        """
        Perform health check on the company researcher.

        Sends one small prompt through the Gemma3 client and reports whether
        the call completed.

        Returns:
            Health status information; "status" is "healthy" or "unhealthy"
        """
        try:
            # Test with a simple company research
            test_prompt = "Return JSON: {\"company_profile\": \"test company\", \"industry\": \"technology\"}"
            response = self.gemma3.generate(test_prompt, "company_research")

            return {
                "status": "healthy",
                "gemma3_status": "accessible",
                # Only the first 100 characters are echoed back.
                "test_response": response[:100] + "..." if len(response) > 100 else response,
            }

        except Exception as e:
            return {
                "status": "unhealthy",
                "error": str(e),
                "gemma3_status": "error",
            }