/**
* GCP Monitoring and Observability Module
*
* Provides:
* - Cloud Monitoring dashboards
* - Alerting policies
* - Log-based metrics
* - Uptime checks
* - SLO monitoring
* - Notification channels
*/
terraform {
required_providers {
google = {
source = "hashicorp/google"
version = "~> 5.0"
}
}
}
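# Example of wiring this module into a root configuration. Illustrative only:
# the variable names below are inferred from how they are referenced in this
# file and may not match the module's actual variables.tf.
#
#   module "monitoring" {
#     source                = "./modules/monitoring"
#     project_id            = "my-gcp-project"
#     environment           = "production"
#     alert_email           = "oncall@example.com"
#     slack_webhook_url     = ""   # optional; empty string skips the channel
#     pagerduty_service_key = ""   # optional; empty string skips the channel
#     cloud_run_url         = "https://mcp-bigquery-server-abc123-uc.a.run.app"
#     cloud_run_location    = "us-central1"
#     labels                = { team = "data-platform" }
#     # threshold and SLO target variables omitted for brevity
#   }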
# Local variables
locals {
service_name = "mcp-bigquery-server"
# Common labels for all monitoring resources
common_labels = merge(
var.labels,
{
environment = var.environment
service = local.service_name
managed_by = "terraform"
}
)
}
# ==========================================
# Notification Channels
# ==========================================
# Email notification channel
resource "google_monitoring_notification_channel" "email" {
display_name = "Email - ${var.environment}"
type = "email"
labels = {
email_address = var.alert_email
}
enabled = true
}
# Slack notification channel (optional)
# Note: the native "slack" channel type expects a channel_name label plus an
# auth_token, not a raw webhook URL, so the webhook URL is delivered through
# the generic token-auth webhook channel type instead.
resource "google_monitoring_notification_channel" "slack" {
count = var.slack_webhook_url != "" ? 1 : 0
display_name = "Slack - ${var.environment}"
type = "webhook_tokenauth"
labels = {
url = var.slack_webhook_url
}
enabled = true
}
# PagerDuty notification channel (optional)
resource "google_monitoring_notification_channel" "pagerduty" {
count = var.pagerduty_service_key != "" ? 1 : 0
display_name = "PagerDuty - ${var.environment}"
type = "pagerduty"
sensitive_labels {
service_key = var.pagerduty_service_key
}
enabled = true
}
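# The channels above assume string variables roughly like the sketch below
# (illustrative; the real variables.tf may declare them differently):
#
#   variable "alert_email" {
#     description = "Email address that receives all monitoring alerts"
#     type        = string
#   }
#
#   variable "slack_webhook_url" {
#     description = "Slack incoming-webhook URL; empty string disables the channel"
#     type        = string
#     default     = ""
#     sensitive   = true
#   }
#
#   variable "pagerduty_service_key" {
#     description = "PagerDuty integration key; empty string disables the channel"
#     type        = string
#     default     = ""
#     sensitive   = true
#   }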
# ==========================================
# Log-Based Metrics
# ==========================================
# Error count metric
resource "google_logging_metric" "error_count" {
name = "mcp_bigquery_error_count_${var.environment}"
filter = <<-EOT
resource.type="cloud_run_revision"
resource.labels.service_name="${local.service_name}"
severity>=ERROR
EOT
metric_descriptor {
metric_kind = "DELTA"
value_type = "INT64"
unit = "1"
labels {
key = "severity"
value_type = "STRING"
description = "Log severity level"
}
}
label_extractors = {
"severity" = "EXTRACT(severity)"
}
}
# Query latency metric
resource "google_logging_metric" "query_latency" {
name = "mcp_bigquery_query_latency_${var.environment}"
filter = <<-EOT
resource.type="cloud_run_revision"
resource.labels.service_name="${local.service_name}"
jsonPayload.message=~"Query completed"
EOT
metric_descriptor {
metric_kind = "DELTA"
value_type = "DISTRIBUTION"
unit = "ms"
labels {
key = "query_type"
value_type = "STRING"
description = "Type of BigQuery operation"
}
}
value_extractor = "EXTRACT(jsonPayload.duration)"
label_extractors = {
"query_type" = "EXTRACT(jsonPayload.operation)"
}
bucket_options {
exponential_buckets {
num_finite_buckets = 64
growth_factor = 2
scale = 1
}
}
}
# Authentication failure metric
resource "google_logging_metric" "auth_failures" {
name = "mcp_bigquery_auth_failures_${var.environment}"
filter = <<-EOT
resource.type="cloud_run_revision"
resource.labels.service_name="${local.service_name}"
jsonPayload.message=~"authentication failed|token verification failed"
EOT
metric_descriptor {
metric_kind = "DELTA"
value_type = "INT64"
unit = "1"
}
}
# BigQuery bytes processed metric
resource "google_logging_metric" "bigquery_bytes_processed" {
name = "mcp_bigquery_bytes_processed_${var.environment}"
filter = <<-EOT
resource.type="cloud_run_revision"
resource.labels.service_name="${local.service_name}"
jsonPayload.bytesProcessed:*
EOT
metric_descriptor {
metric_kind = "DELTA"
value_type = "INT64"
unit = "By"
}
value_extractor = "EXTRACT(jsonPayload.bytesProcessed)"
}
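# The extractors above assume the server emits structured JSON logs roughly
# shaped like the entries below (illustrative, not a verified log schema):
#
#   {"message": "Query completed", "operation": "query",
#    "duration": 125, "bytesProcessed": 1048576}
#
#   {"message": "authentication failed", "severity": "ERROR"}
#
# "duration" feeds the latency distribution (milliseconds) and "bytesProcessed"
# feeds the bytes-processed counter via the EXTRACT() value extractors.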
# ==========================================
# Alerting Policies
# ==========================================
# High error rate alert
resource "google_monitoring_alert_policy" "high_error_rate" {
display_name = "MCP BigQuery - High Error Rate (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "Error rate > ${var.error_rate_threshold}%"
condition_threshold {
filter = "resource.type=\"cloud_run_revision\" AND resource.labels.service_name=\"${local.service_name}\" AND metric.type=\"logging.googleapis.com/user/mcp_bigquery_error_count_${var.environment}\""
duration = "300s"
comparison = "COMPARISON_GT"
threshold_value = var.error_rate_threshold
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_RATE"
}
}
}
notification_channels = concat(
[google_monitoring_notification_channel.email.id],
var.slack_webhook_url != "" ? [google_monitoring_notification_channel.slack[0].id] : [],
var.environment == "production" && var.pagerduty_service_key != "" ? [google_monitoring_notification_channel.pagerduty[0].id] : []
)
alert_strategy {
auto_close = "86400s" # 24 hours
}
documentation {
content = <<-EOT
## High Error Rate Alert
The MCP BigQuery server is experiencing elevated error rates.
**Environment**: ${var.environment}
**Threshold**: ${var.error_rate_threshold} errors/second, sustained for 5 minutes
### Investigation Steps:
1. Check Cloud Logging for recent errors
2. Verify Workload Identity Federation configuration
3. Check BigQuery API quotas and limits
4. Review recent deployments or configuration changes
### Quick Links:
- [Cloud Run Service](https://console.cloud.google.com/run/detail/${var.cloud_run_location}/${local.service_name})
- [Cloud Logging](https://console.cloud.google.com/logs/query)
- [Error Reporting](https://console.cloud.google.com/errors)
EOT
mime_type = "text/markdown"
}
}
# High latency alert
resource "google_monitoring_alert_policy" "high_latency" {
display_name = "MCP BigQuery - High Query Latency (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "P95 latency > ${var.latency_threshold_ms}ms"
condition_threshold {
filter = "resource.type=\"cloud_run_revision\" AND resource.labels.service_name=\"${local.service_name}\" AND metric.type=\"logging.googleapis.com/user/mcp_bigquery_query_latency_${var.environment}\""
duration = "300s"
comparison = "COMPARISON_GT"
threshold_value = var.latency_threshold_ms
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_DELTA"
cross_series_reducer = "REDUCE_PERCENTILE_95"
}
}
}
notification_channels = concat(
[google_monitoring_notification_channel.email.id],
var.slack_webhook_url != "" ? [google_monitoring_notification_channel.slack[0].id] : []
)
alert_strategy {
auto_close = "86400s"
}
documentation {
content = <<-EOT
## High Query Latency Alert
BigQuery queries are taking longer than expected.
**Environment**: ${var.environment}
**Threshold**: ${var.latency_threshold_ms}ms (P95)
### Investigation Steps:
1. Check BigQuery query performance in console
2. Review query complexity and data volume
3. Check for BigQuery slot contention
4. Verify network connectivity to BigQuery
### Quick Links:
- [BigQuery Console](https://console.cloud.google.com/bigquery)
- [Cloud Monitoring](https://console.cloud.google.com/monitoring)
EOT
mime_type = "text/markdown"
}
}
# Authentication failure alert
resource "google_monitoring_alert_policy" "auth_failures" {
display_name = "MCP BigQuery - Authentication Failures (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "Auth failures > ${var.auth_failure_threshold}"
condition_threshold {
filter = "resource.type=\"cloud_run_revision\" AND resource.labels.service_name=\"${local.service_name}\" AND metric.type=\"logging.googleapis.com/user/mcp_bigquery_auth_failures_${var.environment}\""
duration = "300s"
comparison = "COMPARISON_GT"
threshold_value = var.auth_failure_threshold
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_RATE"
}
}
}
notification_channels = concat(
[google_monitoring_notification_channel.email.id],
var.environment == "production" && var.pagerduty_service_key != "" ? [google_monitoring_notification_channel.pagerduty[0].id] : []
)
alert_strategy {
auto_close = "43200s" # 12 hours
}
documentation {
content = <<-EOT
## Authentication Failures Alert
Multiple authentication failures detected.
**Environment**: ${var.environment}
**Threshold**: ${var.auth_failure_threshold} failures/second, sustained for 5 minutes
### Security Investigation:
1. Check Cloud Logging for failed authentication attempts
2. Verify Workload Identity Federation configuration
3. Review Google Workspace OIDC settings
4. Check for potential security incidents
5. Verify service account permissions
### Quick Links:
- [IAM & Admin](https://console.cloud.google.com/iam-admin)
- [Security Command Center](https://console.cloud.google.com/security)
EOT
mime_type = "text/markdown"
}
}
# Cloud Run instance count alert (scaling issues)
resource "google_monitoring_alert_policy" "instance_count" {
display_name = "MCP BigQuery - High Instance Count (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "Instance count > ${var.max_instance_threshold}"
condition_threshold {
filter = "resource.type=\"cloud_run_revision\" AND resource.labels.service_name=\"${local.service_name}\" AND metric.type=\"run.googleapis.com/container/instance_count\""
duration = "300s"
comparison = "COMPARISON_GT"
threshold_value = var.max_instance_threshold
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_MAX"
}
}
}
notification_channels = [google_monitoring_notification_channel.email.id]
alert_strategy {
auto_close = "86400s"
}
documentation {
content = <<-EOT
## High Instance Count Alert
Cloud Run is scaling beyond expected capacity.
**Threshold**: ${var.max_instance_threshold} instances
### Investigation:
1. Check for traffic spikes or DDoS
2. Review application performance
3. Consider adjusting max instances or resources
EOT
mime_type = "text/markdown"
}
}
# Memory utilization alert
resource "google_monitoring_alert_policy" "memory_utilization" {
display_name = "MCP BigQuery - High Memory Usage (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "Memory utilization > ${var.memory_threshold_percent}%"
condition_threshold {
filter = "resource.type=\"cloud_run_revision\" AND resource.labels.service_name=\"${local.service_name}\" AND metric.type=\"run.googleapis.com/container/memory/utilizations\""
duration = "300s"
comparison = "COMPARISON_GT"
threshold_value = var.memory_threshold_percent / 100
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_MEAN"
}
}
}
notification_channels = [google_monitoring_notification_channel.email.id]
alert_strategy {
auto_close = "86400s"
}
}
# ==========================================
# Security Alert Policies
# ==========================================
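# NOTE: the condition_threshold filters in this section match on jsonPayload
# fields. Metric-threshold conditions normally have to reference a metric.type,
# so in practice these policies need either a log-based metric per security
# event type or a log-match condition. A hedged sketch of the log-based-metric
# approach (the resource name and label wiring are assumptions, not part of
# the deployed module):
#
#   resource "google_logging_metric" "security_events" {
#     name   = "mcp_bigquery_security_events_${var.environment}"
#     filter = <<-EOT
#       resource.type="cloud_run_revision"
#       resource.labels.service_name="${local.service_name}"
#       jsonPayload.securityEvent=true
#     EOT
#     metric_descriptor {
#       metric_kind = "DELTA"
#       value_type  = "INT64"
#       unit        = "1"
#       labels {
#         key         = "type"
#         value_type  = "STRING"
#         description = "Security event type (prompt_injection, rate_limit_exceeded, ...)"
#       }
#     }
#     label_extractors = {
#       "type" = "EXTRACT(jsonPayload.type)"
#     }
#   }
#
# An alert filter would then reference, for example:
#   metric.type="logging.googleapis.com/user/mcp_bigquery_security_events_${var.environment}"
#   AND metric.labels.type="prompt_injection"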
# Prompt injection detection alert
resource "google_monitoring_alert_policy" "prompt_injection" {
display_name = "MCP BigQuery - Prompt Injection Detected (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "Prompt injection attempts detected"
condition_threshold {
filter = "resource.type=\"cloud_run_revision\" AND resource.labels.service_name=\"${local.service_name}\" AND jsonPayload.securityEvent=true AND jsonPayload.type=\"prompt_injection\""
duration = "60s"
comparison = "COMPARISON_GT"
threshold_value = 0
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_COUNT"
}
}
}
notification_channels = concat(
[google_monitoring_notification_channel.email.id],
var.slack_webhook_url != "" ? [google_monitoring_notification_channel.slack[0].id] : [],
var.environment == "production" && var.pagerduty_service_key != "" ? [google_monitoring_notification_channel.pagerduty[0].id] : []
)
alert_strategy {
auto_close = "3600s" # 1 hour
}
documentation {
content = <<-EOT
## Security Alert: Prompt Injection Detected
The security middleware has detected potential prompt injection attacks.
**Environment**: ${var.environment}
**Severity**: CRITICAL
### Immediate Actions:
1. Review security logs for attack patterns
2. Identify source IP addresses
3. Consider blocking malicious IPs in Cloud Armor
4. Review recent query patterns
### Investigation:
```
gcloud logging read "jsonPayload.type=\"prompt_injection\"" --limit 50
```
### Quick Links:
- [Security Logs](https://console.cloud.google.com/logs/query?query=jsonPayload.type%3D%22prompt_injection%22)
- [Cloud Armor](https://console.cloud.google.com/net-security/securitypolicies)
EOT
mime_type = "text/markdown"
}
}
# Rate limit exceeded alert
resource "google_monitoring_alert_policy" "rate_limit_exceeded" {
display_name = "MCP BigQuery - Rate Limit Exceeded (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "Multiple rate limit violations"
condition_threshold {
filter = "resource.type=\"cloud_run_revision\" AND resource.labels.service_name=\"${local.service_name}\" AND jsonPayload.securityEvent=true AND jsonPayload.type=\"rate_limit_exceeded\""
duration = "300s"
comparison = "COMPARISON_GT"
threshold_value = var.environment == "production" ? 10 : 50
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_COUNT"
}
}
}
notification_channels = concat(
[google_monitoring_notification_channel.email.id],
var.slack_webhook_url != "" ? [google_monitoring_notification_channel.slack[0].id] : []
)
alert_strategy {
auto_close = "7200s" # 2 hours
}
documentation {
content = <<-EOT
## Security Alert: Rate Limit Exceeded
Multiple rate limit violations detected - possible abuse or DDoS attempt.
**Environment**: ${var.environment}
**Threshold**: ${var.environment == "production" ? 10 : 50} violations per minute, sustained for 5 minutes
### Investigation Steps:
1. Identify source IPs with highest violation counts
2. Determine whether this is a legitimate traffic spike or an attack
3. Review Cloud Armor rate limiting effectiveness
4. Consider raising the rate limits if the traffic is legitimate
### Quick Links:
- [Security Logs](https://console.cloud.google.com/logs/query?query=jsonPayload.type%3D%22rate_limit_exceeded%22)
- [Cloud Armor Metrics](https://console.cloud.google.com/monitoring)
EOT
mime_type = "text/markdown"
}
}
# Unauthorized tool access alert
resource "google_monitoring_alert_policy" "unauthorized_tool" {
display_name = "MCP BigQuery - Unauthorized Tool Access (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "Unauthorized tool access attempts"
condition_threshold {
filter = "resource.type=\"cloud_run_revision\" AND resource.labels.service_name=\"${local.service_name}\" AND jsonPayload.securityEvent=true AND jsonPayload.type=\"unauthorized_tool\""
duration = "60s"
comparison = "COMPARISON_GT"
threshold_value = 0
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_COUNT"
}
}
}
notification_channels = concat(
[google_monitoring_notification_channel.email.id],
var.slack_webhook_url != "" ? [google_monitoring_notification_channel.slack[0].id] : []
)
alert_strategy {
auto_close = "3600s"
}
documentation {
content = <<-EOT
## Security Alert: Unauthorized Tool Access
Attempts to access unauthorized MCP tools detected.
**Environment**: ${var.environment}
**Severity**: HIGH
### Investigation:
1. Review which tools were requested
2. Check if tool whitelist needs updating
3. Verify client configuration
4. Look for signs of compromised credentials
### Quick Links:
- [Security Logs](https://console.cloud.google.com/logs/query?query=jsonPayload.type%3D%22unauthorized_tool%22)
EOT
mime_type = "text/markdown"
}
}
# Sensitive data detected alert
resource "google_monitoring_alert_policy" "sensitive_data" {
display_name = "MCP BigQuery - Sensitive Data Detected (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "Sensitive data in responses"
condition_threshold {
filter = "resource.type=\"cloud_run_revision\" AND resource.labels.service_name=\"${local.service_name}\" AND jsonPayload.securityEvent=true AND jsonPayload.type=\"sensitive_data_detected\""
duration = "300s"
comparison = "COMPARISON_GT"
threshold_value = var.environment == "production" ? 5 : 20
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_COUNT"
}
}
}
notification_channels = [google_monitoring_notification_channel.email.id]
alert_strategy {
auto_close = "86400s"
}
documentation {
content = <<-EOT
## Security Alert: Sensitive Data Detected
Multiple responses contained sensitive data that was redacted.
**Environment**: ${var.environment}
**Threshold**: ${var.environment == "production" ? 5 : 20} detections per minute, sustained for 5 minutes
### Investigation:
1. Review which queries are returning sensitive data
2. Check if data classification is correct
3. Verify BigQuery dataset permissions
4. Consider additional column-level security
### Quick Links:
- [Security Logs](https://console.cloud.google.com/logs/query?query=jsonPayload.type%3D%22sensitive_data_detected%22)
- [BigQuery IAM](https://console.cloud.google.com/iam-admin/iam)
EOT
mime_type = "text/markdown"
}
}
# Tool description change alert (rug pull prevention)
resource "google_monitoring_alert_policy" "tool_description_changed" {
display_name = "MCP BigQuery - Tool Description Changed (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "Tool description modification detected"
condition_threshold {
filter = "resource.type=\"cloud_run_revision\" AND resource.labels.service_name=\"${local.service_name}\" AND jsonPayload.message=\"Tool description changed\""
duration = "60s"
comparison = "COMPARISON_GT"
threshold_value = 0
aggregations {
alignment_period = "60s"
per_series_aligner = "ALIGN_COUNT"
}
}
}
notification_channels = concat(
[google_monitoring_notification_channel.email.id],
var.slack_webhook_url != "" ? [google_monitoring_notification_channel.slack[0].id] : [],
var.environment == "production" && var.pagerduty_service_key != "" ? [google_monitoring_notification_channel.pagerduty[0].id] : []
)
alert_strategy {
auto_close = "3600s"
}
documentation {
content = <<-EOT
## Security Alert: Tool Description Changed (Rug Pull Prevention)
A tool's description has been modified - possible rug pull attack.
**Environment**: ${var.environment}
**Severity**: CRITICAL
### Immediate Actions:
1. Verify if change was authorized
2. Check recent deployments and code changes
3. Review tool registration process
4. Investigate for supply chain compromise
### Prevention:
- All tool changes should go through code review
- Implement signed deployments
- Enable Binary Authorization
### Quick Links:
- [Deployment History](https://console.cloud.google.com/run/detail/${var.cloud_run_location}/${local.service_name}/revisions)
- [Audit Logs](https://console.cloud.google.com/logs/query?query=protoPayload.serviceName%3D%22run.googleapis.com%22)
EOT
mime_type = "text/markdown"
}
}
# ==========================================
# Uptime Checks
# ==========================================
resource "google_monitoring_uptime_check_config" "health_check" {
count = var.cloud_run_url != "" ? 1 : 0
display_name = "MCP BigQuery Health Check - ${var.environment}"
timeout = "10s"
period = "60s"
http_check {
path = "/health"
port = 443
use_ssl = true
validate_ssl = true
request_method = "GET"
accepted_response_status_codes {
status_class = "STATUS_CLASS_2XX"
}
}
monitored_resource {
type = "uptime_url"
labels = {
project_id = var.project_id
host = replace(var.cloud_run_url, "https://", "")
}
}
content_matchers {
content = "healthy"
matcher = "CONTAINS_STRING"
}
}
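# The content matcher above assumes the /health endpoint returns a 2xx response
# whose body contains the literal string "healthy", for example (illustrative):
#
#   GET https://<cloud-run-host>/health  ->  200 OK  {"status": "healthy"}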
# Uptime check alert
resource "google_monitoring_alert_policy" "uptime_check" {
count = var.cloud_run_url != "" ? 1 : 0
display_name = "MCP BigQuery - Service Down (${var.environment})"
combiner = "OR"
enabled = true
conditions {
display_name = "Uptime check failed"
condition_threshold {
filter = "resource.type=\"uptime_url\" AND metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.label.check_id=\"${google_monitoring_uptime_check_config.health_check[0].uptime_check_id}\""
duration = "300s"
comparison = "COMPARISON_LT"
threshold_value = 1
aggregations {
alignment_period = "60s"
cross_series_reducer = "REDUCE_COUNT_FALSE"
per_series_aligner = "ALIGN_NEXT_OLDER"
group_by_fields = ["resource.label.host"]
}
}
}
notification_channels = concat(
[google_monitoring_notification_channel.email.id],
var.environment == "production" && var.pagerduty_service_key != "" ? [google_monitoring_notification_channel.pagerduty[0].id] : []
)
alert_strategy {
auto_close = "3600s" # 1 hour
}
documentation {
content = <<-EOT
## Service Down Alert
The MCP BigQuery service is not responding to health checks.
**Environment**: ${var.environment}
### Immediate Actions:
1. Check Cloud Run service status
2. Review recent deployments
3. Check Cloud Run logs for errors
4. Verify Workload Identity configuration
### Quick Links:
- [Cloud Run Service](https://console.cloud.google.com/run)
- [Cloud Logging](https://console.cloud.google.com/logs)
EOT
mime_type = "text/markdown"
}
}
# ==========================================
# SLO Configuration
# ==========================================
# Availability SLO (goal driven by var.availability_slo_target)
resource "google_monitoring_slo" "availability" {
service = google_monitoring_custom_service.mcp_bigquery.service_id
display_name = "Availability SLO - ${var.environment}"
goal = var.availability_slo_target
rolling_period_days = 30
request_based_sli {
good_total_ratio {
total_service_filter = <<-EOT
resource.type="cloud_run_revision"
resource.labels.service_name="${local.service_name}"
metric.type="run.googleapis.com/request_count"
EOT
good_service_filter = <<-EOT
resource.type="cloud_run_revision"
resource.labels.service_name="${local.service_name}"
metric.type="run.googleapis.com/request_count"
metric.labels.response_code_class="2xx"
EOT
}
}
}
# Latency SLO (goal driven by var.latency_slo_target; requests faster than the latency threshold count as good)
resource "google_monitoring_slo" "latency" {
service = google_monitoring_custom_service.mcp_bigquery.service_id
display_name = "Latency SLO - ${var.environment}"
goal = var.latency_slo_target
rolling_period_days = 30
request_based_sli {
distribution_cut {
distribution_filter = <<-EOT
resource.type="cloud_run_revision"
resource.labels.service_name="${local.service_name}"
metric.type="run.googleapis.com/request_latencies"
EOT
range {
# run.googleapis.com/request_latencies is reported in milliseconds, so the
# threshold is used directly with no unit conversion
max = var.latency_threshold_ms
}
}
}
}
# Custom service for SLO tracking
resource "google_monitoring_custom_service" "mcp_bigquery" {
service_id = "mcp-bigquery-${var.environment}"
display_name = "MCP BigQuery Server - ${var.environment}"
}
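# Burn-rate alerting is a common companion to the SLOs above. A hedged sketch,
# commented out; the lookback window, threshold, and channel wiring here are
# assumptions rather than part of the deployed module:
#
#   resource "google_monitoring_alert_policy" "availability_burn_rate" {
#     display_name = "MCP BigQuery - Availability SLO Burn Rate (${var.environment})"
#     combiner     = "OR"
#     conditions {
#       display_name = "Fast burn on availability SLO"
#       condition_threshold {
#         filter          = "select_slo_burn_rate(\"${google_monitoring_slo.availability.name}\", \"3600s\")"
#         duration        = "300s"
#         comparison      = "COMPARISON_GT"
#         threshold_value = 10
#       }
#     }
#     notification_channels = [google_monitoring_notification_channel.email.id]
#   }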
# ==========================================
# Dashboard
# ==========================================
resource "google_monitoring_dashboard" "mcp_bigquery" {
dashboard_json = templatefile("${path.module}/dashboard.json.tpl", {
project_id = var.project_id
environment = var.environment
service_name = local.service_name
display_name = "MCP BigQuery Server - ${var.environment}"
})
}
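# dashboard.json.tpl is not shown here. A minimal sketch of what such a template
# might contain, using the variables passed above (the layout is an assumption,
# not the module's actual template):
#
#   {
#     "displayName": "${display_name}",
#     "mosaicLayout": {
#       "tiles": [{
#         "width": 6, "height": 4,
#         "widget": {
#           "title": "Request count",
#           "xyChart": {
#             "dataSets": [{
#               "timeSeriesQuery": {
#                 "timeSeriesFilter": {
#                   "filter": "resource.type=\"cloud_run_revision\" resource.labels.service_name=\"${service_name}\" metric.type=\"run.googleapis.com/request_count\""
#                 }
#               }
#             }]
#           }
#         }
#       }]
#     }
#   }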