extract_spectrum_data.pyā¢5.12 kB
#!/usr/bin/env python3
"""
Extract absorption spectrum data from transmission.png
This script processes the image to extract (wavelength, absorbance) data points
"""
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import json
def extract_spectrum_data(image_path, max_points=100):
"""
Extract spectrum data from a plot image
Args:
image_path: Path to the plot image
max_points: Maximum number of data points to extract
Returns:
dict: JSON-like structure with extracted data points
"""
# Load and convert image to numpy array
img = Image.open(image_path)
img_array = np.array(img)
# Convert to grayscale if needed
if len(img_array.shape) == 3:
# Convert RGB to grayscale using standard weights
img_gray = np.dot(img_array[...,:3], [0.2989, 0.5870, 0.1140])
else:
img_gray = img_array
# Get image dimensions
height, width = img_gray.shape
# Define plot area (approximate boundaries based on typical matplotlib plots)
# These values may need adjustment based on the specific image
left_margin = int(width * 0.12) # Left axis area
right_margin = int(width * 0.95) # Right boundary
top_margin = int(height * 0.12) # Top boundary
bottom_margin = int(height * 0.88) # Bottom axis area
plot_width = right_margin - left_margin
plot_height = bottom_margin - top_margin
# Extract the plot region
plot_region = img_gray[top_margin:bottom_margin, left_margin:right_margin]
# Find the darkest pixels in each column (representing the data line)
data_points = []
# Sample columns evenly across the plot width
x_indices = np.linspace(0, plot_width-1, min(max_points, plot_width), dtype=int)
for i, x_idx in enumerate(x_indices):
column = plot_region[:, x_idx]
# Find the minimum value (darkest pixel) in this column
# The data line should be the darkest part
min_idx = np.argmin(column)
# Convert pixel coordinates to data coordinates
# X-axis: wavelength from 400 to 700 nm
wavelength = 400 + (x_idx / (plot_width - 1)) * (700 - 400)
# Y-axis: absorbance from 0 to 1.2 (inverted because image y=0 is at top)
absorbance = 1.2 * (1 - min_idx / (plot_height - 1))
data_points.append({
"x_value": round(wavelength, 2),
"y_value": round(max(0, absorbance), 4) # Ensure non-negative values
})
# Create the output structure similar to axiomatic-plots format
result = {
"series_points": [
{
"series_unique_id": 0,
"points": data_points
}
]
}
return result
def smooth_data(data_points, window_size=5):
"""
Apply simple moving average smoothing to the data
"""
if len(data_points) < window_size:
return data_points
smoothed = []
for i in range(len(data_points)):
if i < window_size // 2 or i >= len(data_points) - window_size // 2:
smoothed.append(data_points[i])
else:
window = data_points[i - window_size//2 : i + window_size//2 + 1]
avg_y = np.mean([p["y_value"] for p in window])
smoothed.append({
"x_value": data_points[i]["x_value"],
"y_value": round(avg_y, 4)
})
return smoothed
def main():
image_path = "/Users/englund/Projects/2025-ai-experiments-MCPs/plot_extraction/transmission.png"
print("Extracting spectrum data from transmission.png...")
# Extract data points
result = extract_spectrum_data(image_path, max_points=100)
# Apply smoothing to reduce noise
result["series_points"][0]["points"] = smooth_data(
result["series_points"][0]["points"],
window_size=3
)
# Save to JSON file
output_file = "/Users/englund/Projects/2025-ai-experiments-MCPs/plot_extraction/extracted_spectrum_data.json"
with open(output_file, 'w') as f:
json.dump(result, f, indent=2)
print(f"Data extracted and saved to {output_file}")
print(f"Number of data points: {len(result['series_points'][0]['points'])}")
# Display some sample points
points = result["series_points"][0]["points"]
print("\nSample data points:")
for i in range(0, len(points), len(points)//10):
p = points[i]
print(f" Wavelength: {p['x_value']} nm, Absorbance: {p['y_value']}")
# Find peaks
y_values = [p["y_value"] for p in points]
max_abs = max(y_values)
max_idx = y_values.index(max_abs)
peak_wavelength = points[max_idx]["x_value"]
print(f"\nSpectrum characteristics:")
print(f" Maximum absorbance: {max_abs} at {peak_wavelength} nm")
print(f" Wavelength range: {points[0]['x_value']} - {points[-1]['x_value']} nm")
print(f" Absorbance range: {min(y_values)} - {max(y_values)}")
return result
if __name__ == "__main__":
main()