"""
Data Extraction Module
AI-powered extraction of subscription details from email content
"""
import re
from datetime import datetime
def extract_subscription_data(email_data):
    """
    Extract structured subscription data from email content.

    Args:
        email_data: Dictionary with email details. The keys read are
            'subject', 'body', 'from', 'date' and 'id'. Missing keys and
            keys whose value is None are treated as empty strings.

    Returns:
        A dictionary. On success it is
        ``{'success': True, 'data': {...}}`` where ``data`` holds
        'service', 'amount', 'renewal_date', 'plan_name', 'cycle',
        'email_id' and 'email_date'. When a required field (service,
        amount or renewal_date) could not be extracted it is
        ``{'success': False, 'error': ..., 'partial_data': {...}}``.
        Any unexpected exception is reported as
        ``{'success': False, 'error': str(error)}`` instead of being raised.
    """
    try:
        # `or ''` also covers keys that are present but set to None, which
        # would otherwise make the string helpers below fail.
        subject = email_data.get('subject') or ''
        body = email_data.get('body') or ''
        from_email = email_data.get('from') or ''
        email_date = email_data.get('date') or ''
        email_id = email_data.get('id') or ''

        service = extract_service_name(subject, from_email, body)
        amount = extract_amount(subject, body)
        renewal_date = extract_renewal_date(subject, body)
        plan_name = extract_plan_name(subject, body)
        cycle = extract_cycle(subject, body)

        # Compare amount against None explicitly: 0.0 is a valid extracted
        # amount (e.g. a "$0.00" receipt) and must not count as missing.
        if not service or amount is None or not renewal_date:
            return {
                'success': False,
                'error': 'Failed to extract required fields (service, amount, or renewal_date)',
                'partial_data': {
                    'service': service,
                    'amount': amount,
                    'renewal_date': renewal_date,
                },
            }

        return {
            'success': True,
            'data': {
                'service': service,
                'amount': amount,
                'renewal_date': renewal_date,
                'plan_name': plan_name,
                'cycle': cycle,
                'email_id': email_id,
                'email_date': email_date,
            },
        }
    except Exception as error:
        # Boundary handler: callers receive an error dictionary, never an
        # exception (this also covers email_data not being a dictionary).
        return {
            'success': False,
            'error': str(error),
        }
def extract_service_name(subject, from_email, body):
    """
    Extract service name from email.

    A fixed list of well-known services is checked first (case-insensitive
    substring match against the subject and the sender). If none matches,
    the first label of the sender's email domain is used, capitalized.

    Args:
        subject: Email subject line.
        from_email: Sender header, either ``Name <user@domain>`` or a bare
            ``user@domain`` address.
        body: Email body. Accepted for a uniform signature; not used here.

    Returns:
        The service name, or 'Unknown Service' when nothing can be derived.
    """
    common_services = [
        'Netflix', 'Spotify', 'Amazon', 'AWS', 'GitHub', 'Adobe', 'Microsoft',
        'Google', 'Apple', 'Dropbox', 'Zoom', 'Slack', 'Salesforce', 'LinkedIn'
    ]
    sender = from_email or ''
    # Lower-case the haystacks once instead of once per candidate service.
    subject_lower = (subject or '').lower()
    sender_lower = sender.lower()

    for service in common_services:
        needle = service.lower()
        if needle in subject_lower or needle in sender_lower:
            return service

    # Prefer the angle-bracketed address (the display name may itself contain
    # an '@'), then fall back to a bare address with no brackets at all.
    from_match = (
        re.search(r'<(.+?)@(.+?)>', sender)
        or re.search(r'([^\s<>@]+)@([^\s<>@]+)', sender)
    )
    if from_match:
        domain = from_match.group(2).split('.')[0]
        return domain.capitalize()

    return 'Unknown Service'
def extract_amount(subject, body):
    """
    Extract amount from email.

    The subject and body are searched, in order, for a dollar-prefixed
    amount, an amount followed by 'USD', and an amount labelled 'total',
    'amount' or 'price'. Amounts must have exactly two decimal places and
    may use comma thousands separators (e.g. '1,299.00').

    Args:
        subject: Email subject line.
        body: Email body text.

    Returns:
        The first matching amount as a float, or None if nothing matches.
    """
    # Either a comma-grouped number (1,299.00) or a plain one (1299.00).
    # The lookbehind keeps the grouped form from starting in the middle of
    # a longer digit run.
    number = r'((?<!\d)\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2})'
    amount_patterns = [
        r'\$' + number,
        number + r'\s*USD',
        r'total[:\s]+\$?' + number,
        r'amount[:\s]+\$?' + number,
        r'price[:\s]+\$?' + number,
    ]
    text = f"{subject} {body}"
    for pattern in amount_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            # Strip thousands separators before converting.
            return float(match.group(1).replace(',', ''))
    return None
def extract_renewal_date(subject, body):
    """
    Extract renewal date from email.

    The subject and body are scanned with several date-shaped patterns, most
    specific first ('renews on <Month D, YYYY>', then M/D/YYYY, then
    YYYY-MM-DD, then any '<word> D, YYYY'). Every candidate of a pattern is
    tried until one parses as a real date.

    Args:
        subject: Email subject line.
        body: Email body text.

    Returns:
        The date formatted as 'YYYY-MM-DD', or None if no candidate parses.
    """
    date_patterns = [
        r'renew(?:s|al)? on (\w+ \d{1,2},? \d{4})',
        r'(\d{1,2}/\d{1,2}/\d{4})',
        r'(\d{4}-\d{2}-\d{2})',
        r'(\w+ \d{1,2},? \d{4})'
    ]
    # The comma is optional in the patterns, so it is removed from the
    # candidate and the formats are written without it.
    date_formats = ['%B %d %Y', '%b %d %Y', '%m/%d/%Y', '%Y-%m-%d']
    text = f"{subject} {body}"

    for pattern in date_patterns:
        # Try every match: an earlier date-shaped non-date ("Invoice 12 2024")
        # must not hide a real date later in the text.
        for match in re.finditer(pattern, text, re.IGNORECASE):
            date_str = match.group(1).replace(',', '')
            for fmt in date_formats:
                try:
                    parsed_date = datetime.strptime(date_str, fmt)
                except ValueError:
                    continue
                return parsed_date.strftime('%Y-%m-%d')
    return None
def extract_plan_name(subject, body):
    """
    Extract plan name from email.

    Looks for one of a fixed set of tier names either directly before the
    word 'plan' ('Premium plan') or after 'plan' followed by a colon or
    whitespace ('Plan: Family'). Matching is case-insensitive and restricted
    to whole words.

    Args:
        subject: Email subject line.
        body: Email body text.

    Returns:
        The matched tier name with only its first letter upper-cased, or
        'Standard' when no tier is found.
    """
    tiers = r'(Standard|Premium|Basic|Pro|Enterprise|Individual|Family|Business)'
    # Word boundaries stop a tier from matching as the prefix of another
    # word ("plan provides" -> "Pro") and stop "plan" from matching inside
    # longer words ("pro planner").
    plan_patterns = [
        r'\b' + tiers + r'\s+plan\b',
        r'\bplan[:\s]+' + tiers + r'\b',
    ]
    text = f"{subject} {body}"
    for pattern in plan_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1).capitalize()
    return 'Standard'
def extract_cycle(subject, body):
    """
    Extract billing cycle from email.

    Keyword heuristics over the lower-cased subject and body, checked in
    this order: yearly ('annual' or 'year' anywhere), quarterly (the whole
    word 'quarter', 'quarters' or 'quarterly'), monthly ('month' anywhere).

    Args:
        subject: Email subject line.
        body: Email body text.

    Returns:
        'yearly', 'quarterly' or 'monthly'; 'monthly' is the default when no
        keyword is present.
    """
    text = f"{subject} {body}".lower()
    # 'yearly' contains 'year', so two substring checks cover all three of
    # the yearly keywords.
    if 'annual' in text or 'year' in text:
        return 'yearly'
    # Quarterly must be tested before monthly: quarterly notices often also
    # say "every 3 months". The word boundary avoids "headquarters".
    if re.search(r'\bquarter(?:ly|s)?\b', text):
        return 'quarterly'
    return 'monthly'