CFM Tips - Cost Optimization MCP Server

nat_gateway_optimization.py•50.6 KiB

""" NAT Gateway Cost Optimization Playbook This module provides evidence-based analysis and optimization recommendations for AWS NAT Gateways, focusing on identifying unused, underutilized, and redundant NAT Gateways to reduce costs. Key optimization areas: 1. Unused NAT Gateways - Not referenced by any route tables (highest priority) 2. Underutilized NAT Gateways - Low data transfer based on CloudWatch metrics or Trusted Advisor 3. Redundant NAT Gateways - Multiple NAT Gateways in the same Availability Zone Features: - Smart sequential exclusion: Each check excludes already-optimized resources - Accurate pricing: Uses AWS Pricing API with regional fallback - Account context: Shows total NAT Gateway count for verification - Cost transparency: Real savings calculations without speculation """ import boto3 import json from datetime import datetime, timedelta from typing import Dict, List, Any, Optional, Tuple from botocore.exceptions import ClientError, NoCredentialsError import logging # Configure logging logger = logging.getLogger(__name__) # Import pricing service try: from services.pricing import get_nat_gateway_pricing except ImportError: logger.warning("Could not import pricing service, using fallback pricing") def get_nat_gateway_pricing(region: str = 'us-east-1') -> Dict[str, Any]: """Fallback pricing function if service is not available.""" fallback_pricing = {'us-east-1': 0.045, 'us-east-2': 0.045, 'us-west-1': 0.048, 'us-west-2': 0.045} hourly_price = fallback_pricing.get(region, 0.045) return { 'status': 'fallback', 'region': region, 'hourly_price': hourly_price, 'monthly_price': round(hourly_price * 24 * 30.44, 2), 'data_processing_price_per_gb': 0.045, 'source': 'fallback_pricing' } def get_nat_gateway_monthly_cost(region: str = 'us-east-1') -> float: """ Get the monthly cost for a NAT Gateway in the specified region using the pricing service. 
Args: region: AWS region Returns: Monthly cost in USD """ try: pricing_info = get_nat_gateway_pricing(region) return pricing_info.get('monthly_price', 32.40) # Default to us-east-1 pricing except Exception as e: logger.warning(f"Error getting NAT Gateway pricing for {region}: {e}") # Fallback to us-east-1 pricing return 32.40 def count_all_nat_gateways(region: str = None) -> Dict[str, Any]: """ Count all NAT Gateways across all regions to provide account-level context. Falls back to single region if permissions are limited. Args: region: Specific region to check (optional, defaults to multi-region scan) Returns: Dictionary with total count and regional breakdown """ try: # Try to get all regions first regions = [] try: ec2 = boto3.client('ec2', region_name='us-east-1') regions_response = ec2.describe_regions() regions = [region['RegionName'] for region in regions_response['Regions']] logger.info(f"Scanning {len(regions)} regions for NAT Gateways") except Exception as e: # If we can't get all regions due to permissions, fall back to common regions if region: regions = [region] logger.warning(f"Limited permissions - scanning only specified region: {region}") else: regions = ['us-east-1', 'us-west-2', 'eu-west-1'] # Common regions logger.warning(f"Limited permissions - scanning common regions only: {regions}") total_count = 0 regional_breakdown = {} permission_limited = False for region_name in regions: try: regional_ec2 = boto3.client('ec2', region_name=region_name) response = regional_ec2.describe_nat_gateways() # Count active NAT Gateways (available, pending) active_count = sum(1 for nat_gw in response.get('NatGateways', []) if nat_gw['State'] in ['available', 'pending']) if active_count > 0: regional_breakdown[region_name] = active_count total_count += active_count except Exception as e: if "UnauthorizedOperation" in str(e): permission_limited = True logger.warning(f"Could not check NAT Gateways in region {region_name}: {str(e)}") continue # Create appropriate 
message based on scan scope if permission_limited: message = f"Found {total_count} NAT Gateways in {len(regional_breakdown)} accessible regions (limited permissions - may not reflect full account)" else: message = f"Found {total_count} NAT Gateways across {len(regional_breakdown)} regions" logger.info(f"NAT Gateway count: {total_count} across {len(regional_breakdown)} regions") return { 'total_count': total_count, 'regions_with_nat_gateways': len(regional_breakdown), 'regional_breakdown': regional_breakdown, 'permission_limited': permission_limited, 'scanned_regions': len(regions), 'message': message } except Exception as e: logger.error(f"Error counting NAT Gateways: {str(e)}") return { 'total_count': 0, 'regions_with_nat_gateways': 0, 'regional_breakdown': {}, 'permission_limited': True, 'scanned_regions': 0, 'message': f"Error counting NAT Gateways: {str(e)}" } def get_nat_gateways(region: str = 'us-east-1') -> List[Dict[str, Any]]: """ Retrieve all NAT Gateways in the specified region. 
Args: region: AWS region to analyze Returns: List of NAT Gateway details """ try: ec2 = boto3.client('ec2', region_name=region) response = ec2.describe_nat_gateways() nat_gateways = [] for nat_gw in response.get('NatGateways', []): if nat_gw['State'] in ['available', 'pending']: nat_gateways.append({ 'nat_gateway_id': nat_gw['NatGatewayId'], 'vpc_id': nat_gw['VpcId'], 'subnet_id': nat_gw['SubnetId'], 'state': nat_gw['State'], 'connectivity_type': nat_gw.get('ConnectivityType', 'public'), 'created_time': nat_gw['CreateTime'], 'tags': {tag['Key']: tag['Value'] for tag in nat_gw.get('Tags', [])}, 'addresses': nat_gw.get('NatGatewayAddresses', []) }) logger.info(f"Found {len(nat_gateways)} NAT Gateways in {region}") return nat_gateways except Exception as e: logger.error(f"Error retrieving NAT Gateways: {str(e)}") return [] def get_nat_gateway_metrics(nat_gateway_id: str, region: str = 'us-east-1', lookback_days: int = 14) -> Dict[str, Any]: """ Get CloudWatch metrics for a NAT Gateway (optimized for cost efficiency). Collects only the 3 metrics actually used in analysis, reducing CloudWatch API costs by 75% compared to collecting all available NAT Gateway metrics. 
Args: nat_gateway_id: NAT Gateway ID region: AWS region lookback_days: Number of days to look back for metrics Returns: Dictionary containing essential metrics data: - BytesInFromSource: Total ingress data transfer - BytesOutToDestination: Total egress data transfer - ActiveConnectionCount: Average active connections """ try: cloudwatch = boto3.client('cloudwatch', region_name=region) end_time = datetime.utcnow() start_time = end_time - timedelta(days=lookback_days) metrics = {} # Define metrics to collect - enhanced for comprehensive analysis when Trusted Advisor unavailable # Base metrics (always collected when CloudWatch is used) base_metrics = [ ('BytesInFromSource', 'Sum'), # Used for data transfer calculation (ingress) ('BytesOutToDestination', 'Sum'), # Used for data transfer calculation (egress) ('ActiveConnectionCount', 'Average') # Used for connection activity analysis ] # Enhanced metrics for capacity and performance optimization enhanced_metrics = [ ('ErrorPortAllocation', 'Sum'), # For secondary IP address recommendations ('PacketsDropCount', 'Sum'), # For capacity optimization detection ] # Use enhanced metrics for comprehensive analysis metric_queries = base_metrics + enhanced_metrics for metric_name, statistic in metric_queries: try: response = cloudwatch.get_metric_statistics( Namespace='AWS/NatGateway', MetricName=metric_name, Dimensions=[ { 'Name': 'NatGatewayId', 'Value': nat_gateway_id } ], StartTime=start_time, EndTime=end_time, Period=3600, # 1 hour periods Statistics=[statistic] ) datapoints = response.get('Datapoints', []) if datapoints: if statistic == 'Sum': total_value = sum(dp[statistic] for dp in datapoints) else: total_value = sum(dp[statistic] for dp in datapoints) / len(datapoints) metrics[metric_name] = { 'value': total_value, 'unit': datapoints[0].get('Unit', 'None'), 'datapoints_count': len(datapoints) } else: metrics[metric_name] = {'value': 0, 'unit': 'None', 'datapoints_count': 0} except Exception as e: logger.warning(f"Could 
not retrieve {metric_name} for {nat_gateway_id}: {str(e)}") metrics[metric_name] = {'value': 0, 'unit': 'None', 'datapoints_count': 0} return metrics except Exception as e: logger.error(f"Error retrieving metrics for NAT Gateway {nat_gateway_id}: {str(e)}") return {} def get_route_tables_using_nat_gateway(nat_gateway_id: str, region: str = 'us-east-1') -> List[Dict[str, Any]]: """ Find route tables that use the specified NAT Gateway. Args: nat_gateway_id: NAT Gateway ID region: AWS region Returns: List of route tables using the NAT Gateway """ try: ec2 = boto3.client('ec2', region_name=region) response = ec2.describe_route_tables() using_route_tables = [] for rt in response.get('RouteTables', []): for route in rt.get('Routes', []): if route.get('NatGatewayId') == nat_gateway_id: using_route_tables.append({ 'route_table_id': rt['RouteTableId'], 'vpc_id': rt['VpcId'], 'destination_cidr': route.get('DestinationCidrBlock', 'N/A'), 'associations': rt.get('Associations', []), 'tags': {tag['Key']: tag['Value'] for tag in rt.get('Tags', [])} }) break return using_route_tables except Exception as e: logger.error(f"Error finding route tables for NAT Gateway {nat_gateway_id}: {str(e)}") return [] def get_trusted_advisor_idle_nat_gateways() -> Dict[str, Any]: """ Get idle NAT Gateway findings from AWS Trusted Advisor. 
Returns: Dictionary containing Trusted Advisor findings for idle NAT Gateways """ try: # Trusted Advisor requires Support API in us-east-1 support_client = boto3.client('support', region_name='us-east-1') # Get all Trusted Advisor checks checks_response = support_client.describe_trusted_advisor_checks(language='en') # Find the idle NAT Gateway check idle_nat_check = None for check in checks_response['checks']: if 'NAT Gateway' in check['name'] and ('idle' in check['name'].lower() or 'underutilized' in check['name'].lower()): idle_nat_check = check break if not idle_nat_check: logger.warning("Trusted Advisor idle NAT Gateway check not found") return { 'status': 'not_available', 'message': 'Trusted Advisor idle NAT Gateway check not available', 'idle_nat_gateways': [], 'count': 0 } # Get the check results check_result = support_client.describe_trusted_advisor_check_result( checkId=idle_nat_check['id'], language='en' ) result_data = check_result['result'] idle_nat_gateways = [] if result_data['status'] in ['warning', 'error'] and result_data.get('flaggedResources'): for resource in result_data['flaggedResources']: # Parse Trusted Advisor resource data # Format varies but typically includes: Region, NAT Gateway ID, VPC ID, estimated monthly savings resource_data = resource.get('metadata', []) if len(resource_data) >= 4: idle_nat_gateways.append({ 'region': resource_data[0] if len(resource_data) > 0 else 'unknown', 'nat_gateway_id': resource_data[1] if len(resource_data) > 1 else 'unknown', 'vpc_id': resource_data[2] if len(resource_data) > 2 else 'unknown', 'estimated_monthly_savings': float(resource_data[3].replace('$', '').replace(',', '')) if len(resource_data) > 3 and resource_data[3] else 0.0, 'status': resource.get('status', 'flagged'), 'resource_id': resource.get('resourceId', ''), 'is_suppressed': resource.get('isSuppressed', False), 'source': 'trusted_advisor', 'check_name': idle_nat_check['name'], 'recommendation': 'Remove idle NAT Gateway identified by 
Trusted Advisor' }) total_savings = sum(nat.get('estimated_monthly_savings', 0) for nat in idle_nat_gateways) return { 'status': 'success', 'check_name': idle_nat_check['name'], 'check_status': result_data['status'], 'idle_nat_gateways': idle_nat_gateways, 'count': len(idle_nat_gateways), 'total_potential_monthly_savings': round(total_savings, 2), 'message': f"Trusted Advisor found {len(idle_nat_gateways)} idle NAT Gateways with potential savings of ${total_savings:.2f}/month" } except Exception as e: logger.error(f"Error getting Trusted Advisor idle NAT Gateways: {str(e)}") return { 'status': 'error', 'message': f"Error accessing Trusted Advisor: {str(e)}", 'idle_nat_gateways': [], 'count': 0, 'total_potential_monthly_savings': 0 } def analyze_underutilized_nat_gateways(region: str = 'us-east-1', data_transfer_threshold_gb: float = 1.0, lookback_days: int = 14, use_trusted_advisor: bool = True, zero_cost_mode: bool = True, include_account_context: bool = True) -> Dict[str, Any]: """ Identify underutilized NAT Gateways using multiple data sources with cost optimization. 
Args: region: AWS region to analyze data_transfer_threshold_gb: Minimum GB of data transfer to consider utilized lookback_days: Number of days to analyze use_trusted_advisor: Whether to use Trusted Advisor findings (recommended) zero_cost_mode: If True, only use CloudWatch when Trusted Advisor fails/unavailable (default: True) Returns: Analysis results with underutilized NAT Gateways Cost Optimization: - When zero_cost_mode=True and Trusted Advisor has data: $0 cost - When zero_cost_mode=True and Trusted Advisor unavailable: CloudWatch costs apply - When zero_cost_mode=False: Always runs both checks (higher cost, more coverage) """ try: # Handle None region if region is None: region = 'us-east-1' underutilized = [] data_sources = [] # Get account-wide context if requested account_context = {} if include_account_context: logger.info("Getting account-wide NAT Gateway count for context") account_nat_gw_count = count_all_nat_gateways(region) account_context = { 'total_nat_gateways_in_account': account_nat_gw_count.get('total_count', 0), 'regions_with_nat_gateways': account_nat_gw_count.get('regions_with_nat_gateways', 0), 'regional_breakdown': account_nat_gw_count.get('regional_breakdown', {}), 'permission_limited': account_nat_gw_count.get('permission_limited', False), 'scanned_regions': account_nat_gw_count.get('scanned_regions', 0), 'context_message': account_nat_gw_count.get('message', 'Unable to determine account context') } # Method 1: Use Trusted Advisor (most reliable, zero cost) trusted_advisor_result = None trusted_advisor_has_data = False if use_trusted_advisor: try: trusted_advisor_result = get_trusted_advisor_idle_nat_gateways() if trusted_advisor_result['status'] == 'success' and trusted_advisor_result.get('count', 0) > 0: # Trusted Advisor has findings - use them and skip CloudWatch trusted_advisor_has_data = True data_sources.append('trusted_advisor') logger.info(f"Trusted Advisor found {trusted_advisor_result['count']} idle NAT Gateways - skipping 
CloudWatch analysis for zero cost") # Add Trusted Advisor findings for ta_nat in trusted_advisor_result['idle_nat_gateways']: if ta_nat['region'] == region or region == 'all': underutilized.append({ 'nat_gateway_id': ta_nat['nat_gateway_id'], 'vpc_id': ta_nat['vpc_id'], 'region': ta_nat['region'], 'estimated_monthly_cost': ta_nat['estimated_monthly_savings'], 'potential_monthly_savings': ta_nat['estimated_monthly_savings'], 'source': 'trusted_advisor', 'check_name': ta_nat['check_name'], 'recommendation': ta_nat['recommendation'], 'confidence': 'high', # Trusted Advisor has high confidence 'analysis_method': 'aws_trusted_advisor' }) elif trusted_advisor_result['status'] == 'success' and trusted_advisor_result.get('count', 0) == 0: # Trusted Advisor available but found no idle NAT Gateways logger.info("Trusted Advisor found no idle NAT Gateways - all NAT Gateways appear to be properly utilized") trusted_advisor_has_data = True data_sources.append('trusted_advisor') else: logger.warning(f"Trusted Advisor not available: {trusted_advisor_result.get('message', 'Unknown error')}") except Exception as ta_error: logger.warning(f"Could not use Trusted Advisor: {str(ta_error)}") # Method 2: CloudWatch metrics analysis (cost-optimized execution) # In zero_cost_mode: ONLY run if Trusted Advisor failed or unavailable # In normal mode: Always run as supplement to Trusted Advisor should_run_cloudwatch = ( not zero_cost_mode or # Always run if zero_cost_mode disabled not trusted_advisor_has_data # Or run if Trusted Advisor failed/unavailable ) if should_run_cloudwatch: if zero_cost_mode: logger.info("Running CloudWatch metrics analysis (incurs costs) because Trusted Advisor unavailable or failed") else: logger.info("Running CloudWatch metrics analysis as supplement to Trusted Advisor (normal mode)") try: nat_gateways = get_nat_gateways(region) data_sources.append('cloudwatch_metrics') for nat_gw in nat_gateways: nat_gateway_id = nat_gw['nat_gateway_id'] metrics = 
get_nat_gateway_metrics(nat_gateway_id, region, lookback_days) # Calculate total data transfer in GB bytes_in = metrics.get('BytesInFromSource', {}).get('value', 0) bytes_out = metrics.get('BytesOutToDestination', {}).get('value', 0) total_bytes = bytes_in + bytes_out total_gb = total_bytes / (1024**3) # Convert to GB # Get route table information route_tables = get_route_tables_using_nat_gateway(nat_gateway_id, region) # Calculate estimated monthly cost (NAT Gateway pricing: correct regional pricing + data processing) monthly_base_cost = get_nat_gateway_monthly_cost(region) data_processing_cost = (total_bytes / (1024**3)) * 0.045 # $0.045 per GB processed estimated_monthly_cost = monthly_base_cost + (data_processing_cost * 30 / lookback_days) if total_gb < data_transfer_threshold_gb: underutilized.append({ 'nat_gateway_id': nat_gateway_id, 'vpc_id': nat_gw['vpc_id'], 'subnet_id': nat_gw['subnet_id'], 'state': nat_gw['state'], 'connectivity_type': nat_gw['connectivity_type'], 'created_time': nat_gw['created_time'], 'tags': nat_gw['tags'], 'total_data_transfer_gb': round(total_gb, 4), 'bytes_in': bytes_in, 'bytes_out': bytes_out, 'active_connections_avg': metrics.get('ActiveConnectionCount', {}).get('value', 0), 'route_tables_count': len(route_tables), 'route_tables': route_tables, 'estimated_monthly_cost': round(estimated_monthly_cost, 2), 'potential_monthly_savings': round(monthly_base_cost, 2) if total_gb == 0 else round(monthly_base_cost * 0.5, 2), 'analysis_period_days': lookback_days, 'source': 'cloudwatch_metrics', 'confidence': 'medium' if total_gb == 0 else 'low', 'analysis_method': 'cloudwatch_data_transfer_analysis', 'recommendation': 'Consider removing if no traffic' if total_gb == 0 else 'Monitor usage patterns - low data transfer detected' }) except Exception as cw_error: logger.warning(f"CloudWatch metrics analysis failed: {str(cw_error)}") else: logger.info("💰 ZERO COST: Skipping CloudWatch metrics analysis - Trusted Advisor provided complete 
results") total_potential_savings = sum(nat.get('potential_monthly_savings', 0) for nat in underutilized) # Calculate analysis cost cloudwatch_api_calls = 0 if 'cloudwatch_metrics' in data_sources: cloudwatch_api_calls = len([nat for nat in underutilized if nat.get('source') != 'trusted_advisor']) * 3 analysis_cost = cloudwatch_api_calls * 0.00001 # $0.01 per 1000 calls result = { 'status': 'success', 'region': region, 'analysis_period_days': lookback_days, 'data_transfer_threshold_gb': data_transfer_threshold_gb, 'data_sources': data_sources, 'trusted_advisor_available': trusted_advisor_result is not None and trusted_advisor_result['status'] == 'success', 'zero_cost_mode': zero_cost_mode, 'cost_analysis': { 'cloudwatch_api_calls': cloudwatch_api_calls, 'estimated_analysis_cost': round(analysis_cost, 6), 'zero_cost_achieved': analysis_cost == 0, 'cost_savings_vs_potential': f"Analysis cost ${analysis_cost:.6f} vs potential savings ${total_potential_savings:.2f}" }, 'underutilized_nat_gateways': underutilized, 'count': len(underutilized), 'total_potential_monthly_savings': round(total_potential_savings, 2), 'message': f"Found {len(underutilized)} underutilized NAT Gateways with potential savings of ${total_potential_savings:.2f}/month using {', '.join(data_sources)} (analysis cost: ${analysis_cost:.6f})" } # Add account context if requested if include_account_context and account_context: result['account_context'] = account_context result['message'] = f"{result['message']} | Account has {account_context.get('total_nat_gateways_in_account', 0)} total NAT Gateways" return result except Exception as e: logger.error(f"Error analyzing underutilized NAT Gateways: {str(e)}") return { 'status': 'error', 'message': f"Error analyzing NAT Gateways: {str(e)}", 'underutilized_nat_gateways': [], 'count': 0, 'total_potential_monthly_savings': 0 } def analyze_redundant_nat_gateways(region: str = 'us-east-1', include_account_context: bool = True) -> Dict[str, Any]: """ Identify 
potentially redundant NAT Gateways in the same availability zone. Args: region: AWS region to analyze Returns: Analysis results with potentially redundant NAT Gateways """ try: # Handle None region if region is None: region = 'us-east-1' # Get account-wide context if requested account_context = {} if include_account_context: logger.info("Getting account-wide NAT Gateway count for context") account_nat_gw_count = count_all_nat_gateways(region) account_context = { 'total_nat_gateways_in_account': account_nat_gw_count.get('total_count', 0), 'regions_with_nat_gateways': account_nat_gw_count.get('regions_with_nat_gateways', 0), 'regional_breakdown': account_nat_gw_count.get('regional_breakdown', {}), 'permission_limited': account_nat_gw_count.get('permission_limited', False), 'scanned_regions': account_nat_gw_count.get('scanned_regions', 0), 'context_message': account_nat_gw_count.get('message', 'Unable to determine account context') } ec2 = boto3.client('ec2', region_name=region) nat_gateways = get_nat_gateways(region) # Get subnet AZ information subnet_ids = [nat_gw['subnet_id'] for nat_gw in nat_gateways] if not subnet_ids: return { 'status': 'success', 'redundant_groups': [], 'count': 0, 'message': "No NAT Gateways found" } subnets_response = ec2.describe_subnets(SubnetIds=subnet_ids) subnet_az_map = { subnet['SubnetId']: subnet['AvailabilityZone'] for subnet in subnets_response['Subnets'] } # Group NAT Gateways by VPC and AZ vpc_az_groups = {} for nat_gw in nat_gateways: vpc_id = nat_gw['vpc_id'] subnet_id = nat_gw['subnet_id'] az = subnet_az_map.get(subnet_id, 'unknown') key = f"{vpc_id}:{az}" if key not in vpc_az_groups: vpc_az_groups[key] = [] vpc_az_groups[key].append(nat_gw) # Find groups with multiple NAT Gateways redundant_groups = [] for key, nat_gws in vpc_az_groups.items(): if len(nat_gws) > 1: vpc_id, az = key.split(':') # Get metrics for each NAT Gateway in the group nat_gws_with_metrics = [] for nat_gw in nat_gws: metrics = 
get_nat_gateway_metrics(nat_gw['nat_gateway_id'], region) route_tables = get_route_tables_using_nat_gateway(nat_gw['nat_gateway_id'], region) nat_gw_analysis = nat_gw.copy() nat_gw_analysis.update({ 'total_data_transfer_gb': round( (metrics.get('BytesInFromSource', {}).get('value', 0) + metrics.get('BytesOutToDestination', {}).get('value', 0)) / (1024**3), 4 ), 'active_connections_avg': metrics.get('ActiveConnectionCount', {}).get('value', 0), 'route_tables_count': len(route_tables), 'route_tables': route_tables }) nat_gws_with_metrics.append(nat_gw_analysis) # Sort by usage (data transfer) nat_gws_with_metrics.sort(key=lambda x: x['total_data_transfer_gb'], reverse=True) redundant_groups.append({ 'vpc_id': vpc_id, 'availability_zone': az, 'nat_gateways': nat_gws_with_metrics, 'count': len(nat_gws_with_metrics), 'recommendation': f"Consider consolidating {len(nat_gws_with_metrics)} NAT Gateways in {az}", 'potential_monthly_savings': (len(nat_gws_with_metrics) - 1) * get_nat_gateway_monthly_cost(region) # Keep one, remove others }) total_potential_savings = sum(group['potential_monthly_savings'] for group in redundant_groups) result = { 'status': 'success', 'region': region, 'redundant_groups': redundant_groups, 'count': len(redundant_groups), 'total_potential_monthly_savings': round(total_potential_savings, 2), 'message': f"Found {len(redundant_groups)} groups with potentially redundant NAT Gateways" } # Add account context if requested if include_account_context and account_context: result['account_context'] = account_context result['message'] = f"{result['message']} | Account has {account_context.get('total_nat_gateways_in_account', 0)} total NAT Gateways" return result except Exception as e: logger.error(f"Error analyzing redundant NAT Gateways: {str(e)}") return { 'status': 'error', 'message': f"Error analyzing redundant NAT Gateways: {str(e)}", 'redundant_groups': [], 'count': 0, 'total_potential_monthly_savings': 0 } # Cross-AZ traffic analysis removed - 
speculative savings based on fictional cost estimates # Would require VPC Flow Logs analysis to provide meaningful recommendations # VPC Endpoint analysis removed - requires actual traffic analysis to provide meaningful recommendations # Without VPC Flow Logs or detailed CloudWatch analysis, VPC endpoint savings are speculative def analyze_unused_nat_gateways(region: str = 'us-east-1', include_account_context: bool = True) -> Dict[str, Any]: """ Identify NAT Gateways that are not referenced by any route tables. Args: region: AWS region to analyze Returns: Analysis results with unused NAT Gateways """ try: # Handle None region if region is None: region = 'us-east-1' # Get account-wide context if requested account_context = {} if include_account_context: logger.info("Getting account-wide NAT Gateway count for context") account_nat_gw_count = count_all_nat_gateways(region) account_context = { 'total_nat_gateways_in_account': account_nat_gw_count.get('total_count', 0), 'regions_with_nat_gateways': account_nat_gw_count.get('regions_with_nat_gateways', 0), 'regional_breakdown': account_nat_gw_count.get('regional_breakdown', {}), 'permission_limited': account_nat_gw_count.get('permission_limited', False), 'scanned_regions': account_nat_gw_count.get('scanned_regions', 0), 'context_message': account_nat_gw_count.get('message', 'Unable to determine account context') } nat_gateways = get_nat_gateways(region) unused_nat_gateways = [] for nat_gw in nat_gateways: nat_gateway_id = nat_gw['nat_gateway_id'] route_tables = get_route_tables_using_nat_gateway(nat_gateway_id, region) if not route_tables: # Get metrics to confirm no usage metrics = get_nat_gateway_metrics(nat_gateway_id, region) total_bytes = (metrics.get('BytesInFromSource', {}).get('value', 0) + metrics.get('BytesOutToDestination', {}).get('value', 0)) unused_nat_gateways.append({ 'nat_gateway_id': nat_gateway_id, 'vpc_id': nat_gw['vpc_id'], 'subnet_id': nat_gw['subnet_id'], 'state': nat_gw['state'], 
'connectivity_type': nat_gw['connectivity_type'], 'created_time': nat_gw['created_time'], 'tags': nat_gw['tags'], 'route_tables_count': 0, 'total_data_transfer_bytes': total_bytes, 'estimated_monthly_cost': get_nat_gateway_monthly_cost(region), 'potential_monthly_savings': get_nat_gateway_monthly_cost(region), 'recommendation': 'Safe to delete - not used by any route tables' }) total_potential_savings = sum(nat['potential_monthly_savings'] for nat in unused_nat_gateways) result = { 'status': 'success', 'region': region, 'unused_nat_gateways': unused_nat_gateways, 'count': len(unused_nat_gateways), 'total_potential_monthly_savings': round(total_potential_savings, 2), 'message': f"Found {len(unused_nat_gateways)} unused NAT Gateways with potential savings of ${total_potential_savings:.2f}/month" } # Add account context if requested if include_account_context and account_context: result['account_context'] = account_context result['message'] = f"{result['message']} | Account has {account_context.get('total_nat_gateways_in_account', 0)} total NAT Gateways" return result except Exception as e: logger.error(f"Error analyzing unused NAT Gateways: {str(e)}") return { 'status': 'error', 'message': f"Error analyzing unused NAT Gateways: {str(e)}", 'unused_nat_gateways': [], 'count': 0, 'total_potential_monthly_savings': 0 } # Secondary IP consolidation analysis removed - too complex for minimal value # Private NAT Gateway analysis removed - too complex for minimal value def generate_nat_gateway_optimization_report(region: str = 'us-east-1', data_transfer_threshold_gb: float = 1.0, lookback_days: int = 14, output_format: str = 'json') -> Dict[str, Any]: """ Generate a comprehensive NAT Gateway optimization report. 
Args: region: AWS region to analyze data_transfer_threshold_gb: Minimum GB threshold for utilization lookback_days: Number of days to analyze output_format: Output format ('json' or 'markdown') Returns: Comprehensive optimization report """ try: # First, get account-wide NAT Gateway count for context logger.info("Getting account-wide NAT Gateway count for context verification") account_nat_gw_count = count_all_nat_gateways(region) # Get all NAT Gateways once all_nat_gateways = get_nat_gateways(region) all_nat_gateway_ids = {nat['nat_gateway_id'] for nat in all_nat_gateways} # Track which NAT Gateways have been optimized (to exclude from subsequent checks) optimized_nat_gateway_ids = set() nat_gateway_savings = {} optimization_applied = {} # PRIORITY 1: Unused NAT Gateways (highest priority - complete removal) logger.info("Step 1: Checking for unused NAT Gateways...") unused_analysis = analyze_unused_nat_gateways(region, include_account_context=False) for nat_gw in unused_analysis.get('unused_nat_gateways', []): nat_id = nat_gw['nat_gateway_id'] nat_gateway_savings[nat_id] = nat_gw.get('potential_monthly_savings', 0) optimization_applied[nat_id] = 'unused_deletion' optimized_nat_gateway_ids.add(nat_id) logger.info(f"Found {len(optimized_nat_gateway_ids)} unused NAT Gateways - excluding from further analysis") # PRIORITY 2: Underutilized NAT Gateways (only check remaining NAT Gateways) remaining_nat_gateways = all_nat_gateway_ids - optimized_nat_gateway_ids if remaining_nat_gateways: logger.info(f"Step 2: Checking {len(remaining_nat_gateways)} remaining NAT Gateways for underutilization...") underutilized_analysis = analyze_underutilized_nat_gateways( region, data_transfer_threshold_gb, lookback_days, use_trusted_advisor=True, include_account_context=False ) for nat_gw in underutilized_analysis.get('underutilized_nat_gateways', []): nat_id = nat_gw['nat_gateway_id'] if nat_id not in optimized_nat_gateway_ids: # Double-check exclusion nat_gateway_savings[nat_id] = 
nat_gw.get('potential_monthly_savings', 0) optimization_applied[nat_id] = 'underutilized_removal' optimized_nat_gateway_ids.add(nat_id) else: logger.info("Step 2: Skipping underutilization check - all NAT Gateways already flagged as unused") underutilized_analysis = {'count': 0, 'underutilized_nat_gateways': [], 'total_potential_monthly_savings': 0} # PRIORITY 3: Redundant NAT Gateways (only check remaining NAT Gateways) remaining_nat_gateways = all_nat_gateway_ids - optimized_nat_gateway_ids if remaining_nat_gateways: logger.info(f"Step 3: Checking {len(remaining_nat_gateways)} remaining NAT Gateways for redundancy...") redundant_analysis = analyze_redundant_nat_gateways(region, include_account_context=False) for group in redundant_analysis.get('redundant_groups', []): for nat_gw in group.get('nat_gateways', []): nat_id = nat_gw['nat_gateway_id'] if nat_id not in optimized_nat_gateway_ids: # Double-check exclusion group_savings = group.get('potential_monthly_savings', 0) per_gateway_savings = group_savings / max(1, len(group.get('nat_gateways', []))) nat_gateway_savings[nat_id] = per_gateway_savings optimization_applied[nat_id] = 'redundancy_consolidation' optimized_nat_gateway_ids.add(nat_id) else: logger.info("Step 3: Skipping redundancy check - all NAT Gateways already optimized") redundant_analysis = {'count': 0, 'redundant_groups': [], 'total_potential_monthly_savings': 0} # Get dedicated Trusted Advisor findings (for reporting purposes) trusted_advisor_analysis = get_idle_nat_gateways_from_trusted_advisor() # Deduplication already done upfront - each check excludes previously optimized NAT Gateways # VPC Endpoint analysis removed - too complex and speculative without actual traffic data # Calculate total savings (already deduplicated through sequential exclusion) nat_gateway_total_savings = sum(nat_gateway_savings.values()) total_savings = nat_gateway_total_savings logger.info(f"Optimization complete: {len(optimized_nat_gateway_ids)} NAT Gateways optimized, 
${total_savings:.2f}/month savings") # Create optimization summary optimization_summary = { 'total_nat_gateways_in_region': len(all_nat_gateway_ids), 'total_nat_gateways_optimized': len(optimized_nat_gateway_ids), 'optimizations_applied': optimization_applied, 'nat_gateway_savings_breakdown': nat_gateway_savings, 'total_monthly_savings': round(nat_gateway_total_savings, 2), 'optimization_method': 'sequential_exclusion', 'logic_explanation': 'Each check excludes NAT Gateways already optimized by higher-priority checks (unused > underutilized > redundant)', 'efficiency_note': 'No duplicate analysis - each NAT Gateway analyzed only once' } report = { 'status': 'success', 'report_type': 'NAT Gateway Comprehensive Optimization Report', 'region': region, 'analysis_date': datetime.utcnow().isoformat(), 'account_context': { 'total_nat_gateways_in_account': account_nat_gw_count.get('total_count', 0), 'regions_with_nat_gateways': account_nat_gw_count.get('regions_with_nat_gateways', 0), 'regional_breakdown': account_nat_gw_count.get('regional_breakdown', {}), 'context_message': account_nat_gw_count.get('message', 'Unable to determine account context'), 'analysis_scope': f"Analyzing region '{region}' out of {account_nat_gw_count.get('total_count', 0)} total NAT Gateways" }, 'analysis_parameters': { 'data_transfer_threshold_gb': data_transfer_threshold_gb, 'lookback_days': lookback_days }, 'trusted_advisor_analysis': trusted_advisor_analysis, 'underutilized_analysis': underutilized_analysis, 'redundant_analysis': redundant_analysis, 'unused_analysis': unused_analysis, 'data_sources': { 'trusted_advisor_available': trusted_advisor_analysis.get('status') == 'success', 'cloudwatch_metrics': underutilized_analysis.get('data_sources', []), 'primary_source': 'trusted_advisor' if trusted_advisor_analysis.get('status') == 'success' else 'cloudwatch_metrics' }, 'optimization_summary': optimization_summary, 'summary': { 'total_underutilized_nat_gateways': 
underutilized_analysis.get('count', 0), 'total_redundant_groups': redundant_analysis.get('count', 0), 'total_unused_nat_gateways': unused_analysis.get('count', 0), 'trusted_advisor_idle_nat_gateways': trusted_advisor_analysis.get('count', 0), 'total_potential_monthly_savings': round(total_savings, 2), 'savings_calculation_method': 'deduplicated_smart_optimization' }, 'recommendations': [ { 'priority': 'critical', 'category': 'unused_nat_gateways', 'description': f'Remove {unused_analysis.get("count", 0)} unused NAT Gateways (highest priority - complete removal)', 'potential_savings': sum(savings for nat_id, savings in nat_gateway_savings.items() if optimization_applied.get(nat_id) == 'unused_deletion'), 'confidence': 'high', 'source': 'route_table_analysis', 'implementation_complexity': 'low', 'affected_resources': [nat_id for nat_id, opt in optimization_applied.items() if opt == 'unused_deletion'] }, { 'priority': 'high', 'category': 'underutilized_nat_gateways', 'description': f'Remove {len([1 for opt in optimization_applied.values() if opt == "underutilized_removal"])} underutilized NAT Gateways (low traffic)', 'potential_savings': sum(savings for nat_id, savings in nat_gateway_savings.items() if optimization_applied.get(nat_id) == 'underutilized_removal'), 'confidence': 'medium', 'source': 'cloudwatch_metrics', 'implementation_complexity': 'low', 'affected_resources': [nat_id for nat_id, opt in optimization_applied.items() if opt == 'underutilized_removal'] }, { 'priority': 'medium', 'category': 'redundant_nat_gateways', 'description': f'Consolidate {len([1 for opt in optimization_applied.values() if opt == "redundancy_consolidation"])} redundant NAT Gateways', 'potential_savings': sum(savings for nat_id, savings in nat_gateway_savings.items() if optimization_applied.get(nat_id) == 'redundancy_consolidation'), 'confidence': 'medium', 'source': 'availability_zone_analysis', 'implementation_complexity': 'medium', 'affected_resources': [nat_id for nat_id, opt in 
optimization_applied.items() if opt == 'redundancy_consolidation'] }, ] } # Add smart_recommendations as an alias for backward compatibility report['smart_recommendations'] = report['recommendations'] return report except Exception as e: logger.error(f"Error generating NAT Gateway optimization report: {str(e)}") return { 'status': 'error', 'message': f"Error generating report: {str(e)}", 'total_potential_monthly_savings': 0 } def get_idle_nat_gateways_from_trusted_advisor() -> Dict[str, Any]: """ Get idle NAT Gateway findings specifically from Trusted Advisor. This is a dedicated function that focuses only on Trusted Advisor results. Returns: Dictionary containing only Trusted Advisor idle NAT Gateway findings """ try: logger.info("Querying Trusted Advisor for idle NAT Gateway findings") # Use the existing function but return in a cleaner format ta_result = get_trusted_advisor_idle_nat_gateways() if ta_result['status'] != 'success': return ta_result # Enhance the results with additional context enhanced_findings = [] for nat_gw in ta_result['idle_nat_gateways']: # Try to get additional details for each NAT Gateway try: ec2 = boto3.client('ec2', region_name=nat_gw['region']) nat_details = ec2.describe_nat_gateways( NatGatewayIds=[nat_gw['nat_gateway_id']] ) if nat_details['NatGateways']: nat_detail = nat_details['NatGateways'][0] enhanced_finding = nat_gw.copy() enhanced_finding.update({ 'subnet_id': nat_detail.get('SubnetId'), 'state': nat_detail.get('State'), 'connectivity_type': nat_detail.get('ConnectivityType', 'public'), 'created_time': nat_detail.get('CreateTime'), 'tags': {tag['Key']: tag['Value'] for tag in nat_detail.get('Tags', [])}, 'addresses': nat_detail.get('NatGatewayAddresses', []) }) enhanced_findings.append(enhanced_finding) else: # NAT Gateway might have been deleted, keep original data enhanced_findings.append(nat_gw) except Exception as detail_error: logger.warning(f"Could not get details for NAT Gateway {nat_gw['nat_gateway_id']}: 
{str(detail_error)}") # Keep the original Trusted Advisor data enhanced_findings.append(nat_gw) return { 'status': 'success', 'source': 'trusted_advisor', 'check_name': ta_result.get('check_name', 'Idle NAT Gateways'), 'check_status': ta_result.get('check_status', 'unknown'), 'idle_nat_gateways': enhanced_findings, 'count': len(enhanced_findings), 'total_potential_monthly_savings': ta_result.get('total_potential_monthly_savings', 0), 'message': f"Trusted Advisor identified {len(enhanced_findings)} idle NAT Gateways", 'recommendation': 'Review and consider removing idle NAT Gateways identified by AWS Trusted Advisor', 'confidence_level': 'high', 'data_freshness': 'Trusted Advisor data is refreshed every 24 hours' } except Exception as e: logger.error(f"Error getting Trusted Advisor idle NAT Gateways: {str(e)}") return { 'status': 'error', 'message': f"Error accessing Trusted Advisor: {str(e)}", 'idle_nat_gateways': [], 'count': 0, 'total_potential_monthly_savings': 0 }

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/aws-samples/sample-cfm-tips-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

nat_gateway_optimization.py•50.6 KiB