#!/usr/bin/env python3
"""Generate synthetic data for Cloud SQL database."""
import os
import sys
from datetime import datetime, timedelta
from decimal import Decimal
import random
from faker import Faker
import psycopg2
from psycopg2.extras import execute_values
# Configuration
NUM_CUSTOMERS = 50
NUM_VENDORS = 50
NUM_ORDERS = 50
# Order statuses
ORDER_STATUSES = ['pending', 'processing', 'shipped', 'delivered', 'cancelled']
# Sample products
PRODUCTS = [
'Laptop', 'Desktop Computer', 'Monitor', 'Keyboard', 'Mouse',
'Headphones', 'Webcam', 'USB Cable', 'External Hard Drive', 'SSD',
'RAM Module', 'Graphics Card', 'Motherboard', 'Power Supply', 'CPU',
'Phone Case', 'Charger', 'Screen Protector', 'Wireless Earbuds', 'Smartwatch'
]
def get_db_connection():
"""Create database connection."""
return psycopg2.connect(
host=os.getenv('DB_HOST'),
database=os.getenv('CLOUDSQL_DATABASE', 'salesdb'),
user=os.getenv('CLOUDSQL_USER', 'salesuser'),
password=os.getenv('CLOUDSQL_PASSWORD')
)
def generate_customers(fake, num_customers):
"""Generate synthetic customer data."""
customers = []
for _ in range(num_customers):
customers.append((
fake.first_name(),
fake.last_name(),
fake.unique.email(),
fake.phone_number()[:20],
fake.street_address(),
fake.city(),
fake.state_abbr(),
fake.zipcode(),
'USA'
))
return customers
def generate_vendors(fake, num_vendors):
"""Generate synthetic vendor data."""
vendors = []
for _ in range(num_vendors):
vendors.append((
fake.company(),
fake.name(),
fake.company_email(),
fake.phone_number()[:20],
fake.street_address(),
fake.city(),
fake.state_abbr(),
fake.zipcode(),
'USA'
))
return vendors
def generate_orders(fake, num_orders, customer_ids, vendor_ids):
"""Generate synthetic order data."""
orders = []
for _ in range(num_orders):
quantity = random.randint(1, 10)
unit_price = Decimal(random.uniform(10.0, 999.99)).quantize(Decimal('0.01'))
total_amount = quantity * unit_price
order_date = fake.date_time_between(start_date='-1y', end_date='now')
ship_date = None
status = random.choice(ORDER_STATUSES)
if status in ['shipped', 'delivered']:
ship_date = order_date + timedelta(days=random.randint(1, 7))
orders.append((
random.choice(customer_ids),
random.choice(vendor_ids),
order_date,
ship_date,
total_amount,
status,
random.choice(PRODUCTS),
quantity,
unit_price
))
return orders
def seed_database():
"""Seed the database with synthetic data."""
print("Connecting to database...")
conn = get_db_connection()
cur = conn.cursor()
try:
fake = Faker()
Faker.seed(42) # For reproducibility
# Generate customers
print(f"Generating {NUM_CUSTOMERS} customers...")
customers = generate_customers(fake, NUM_CUSTOMERS)
execute_values(
cur,
"""INSERT INTO customers
(first_name, last_name, email, phone, address, city, state, zip_code, country)
VALUES %s RETURNING customer_id""",
customers
)
customer_ids = [row[0] for row in cur.fetchall()]
print(f"Created {len(customer_ids)} customers")
# Generate vendors
print(f"Generating {NUM_VENDORS} vendors...")
vendors = generate_vendors(fake, NUM_VENDORS)
execute_values(
cur,
"""INSERT INTO vendors
(vendor_name, contact_name, email, phone, address, city, state, zip_code, country)
VALUES %s RETURNING vendor_id""",
vendors
)
vendor_ids = [row[0] for row in cur.fetchall()]
print(f"Created {len(vendor_ids)} vendors")
# Generate orders
print(f"Generating {NUM_ORDERS} orders...")
orders = generate_orders(fake, NUM_ORDERS, customer_ids, vendor_ids)
execute_values(
cur,
"""INSERT INTO orders
(customer_id, vendor_id, order_date, ship_date, total_amount, status,
product_name, quantity, unit_price)
VALUES %s""",
orders
)
print(f"Created {NUM_ORDERS} orders")
conn.commit()
# Display summary statistics
print("\n=== Database Seeding Summary ===")
cur.execute("SELECT COUNT(*) FROM customers")
print(f"Total customers: {cur.fetchone()[0]}")
cur.execute("SELECT COUNT(*) FROM vendors")
print(f"Total vendors: {cur.fetchone()[0]}")
cur.execute("SELECT COUNT(*) FROM orders")
print(f"Total orders: {cur.fetchone()[0]}")
cur.execute("SELECT SUM(total_amount) FROM orders")
total_sales = cur.fetchone()[0]
print(f"Total sales amount: ${total_sales:,.2f}")
print("\n=== Sample Queries ===")
cur.execute("""
SELECT status, COUNT(*) as count, SUM(total_amount) as total
FROM orders
GROUP BY status
ORDER BY total DESC
""")
print("\nOrders by status:")
for row in cur.fetchall():
print(f" {row[0]}: {row[1]} orders, ${row[2]:,.2f}")
print("\nDatabase seeded successfully!")
except Exception as e:
conn.rollback()
print(f"Error seeding database: {e}", file=sys.stderr)
raise
finally:
cur.close()
conn.close()
if __name__ == "__main__":
seed_database()