# json_extractor.py
import json
import re
from typing import Dict, Any
def extract_first_json_object(text: str) -> Dict[str, Any]:
    """
    Extract the first valid JSON object from text that may contain mixed content.

    Handles:
    - Clean JSON strings
    - JSON wrapped in markdown code fences (```json ... ``` or bare ``` ... ```)
    - JSON embedded in prose
    - Malformed JSON, reported as ValueError

    Only JSON objects are returned. If the whole text parses as some other
    JSON value (array, number, string, ...), that value is not returned;
    the text is searched for an embedded object instead.

    Args:
        text: Input text that may contain JSON

    Returns:
        Parsed JSON object

    Raises:
        ValueError: If text is empty or not a string, or if no valid JSON
            object is found
    """
    if not text or not isinstance(text, str):
        raise ValueError("Input text must be a non-empty string")

    stripped = text.strip()

    # Unwrap a markdown fence only at the very start / very end of the text.
    # Anchoring matters: an unanchored substitution would also delete a
    # "```json" sequence that appears inside a JSON string value and thereby
    # silently corrupt the payload.
    unfenced = re.sub(r"\A```(?:json)?\s*", "", stripped, flags=re.IGNORECASE)
    unfenced = re.sub(r"```\s*\Z", "", unfenced).strip()

    # Fast path: the (unfenced) text is a JSON document on its own.
    try:
        parsed = json.loads(unfenced)
    except json.JSONDecodeError:
        parsed = None
    # json.loads also accepts arrays and scalars; the contract is "object".
    if isinstance(parsed, dict):
        return parsed

    # Slow path: look for an object embedded in the surrounding text. The
    # untouched (only whitespace-stripped) text is passed on so the payload is
    # never modified by fence removal.
    return _extract_json_with_parsing(stripped)
def _extract_json_with_parsing(text: str) -> Dict[str, Any]:
    """
    Find and parse the first valid JSON object embedded in text.

    Every "{" in the text is tried, left to right, as the start of an object.
    The attempt is made with the JSON decoder itself (raw_decode), which parses
    one complete value and ignores whatever follows it. Because the real
    parser decides where the object ends, string literals, escape sequences
    and nested braces are handled exactly as JSON defines them, and text
    outside the object (stray quotes, unmatched "}" characters, prose) cannot
    disturb the search.

    Args:
        text: Text to search.

    Returns:
        The first object that parses successfully.

    Raises:
        ValueError: If no "{" in the text starts a valid JSON object.
    """
    decoder = json.JSONDecoder()

    brace_at = text.find("{")
    while brace_at != -1:
        try:
            # Parsing starts at a "{", so a successful result is always an
            # object; the returned end index is not needed.
            parsed, _ = decoder.raw_decode(text, brace_at)
        except json.JSONDecodeError:
            # Not an object start (or malformed): move on to the next brace,
            # which may be nested inside the span that just failed.
            brace_at = text.find("{", brace_at + 1)
        else:
            return parsed

    raise ValueError("No valid JSON object found in text")