# test_json_extractor.py (4.24 kB)
import pytest
from utils.json_extractor import extract_first_json_object
class TestJSONExtractor:
    """Exercise extract_first_json_object on clean, wrapped, and invalid input."""

    def test_clean_json(self):
        """A bare JSON document is expected to parse to the equivalent dict."""
        parsed = extract_first_json_object(
            '{"cleaned": "test", "notes": ["note1"], "unchanged": false}'
        )
        expected = {"cleaned": "test", "notes": ["note1"], "unchanged": False}
        assert parsed == expected

    def test_json_with_code_fence(self):
        """An object inside a markdown ```json fence is expected to be extracted."""
        fenced = '```json\n{"cleaned": "test", "notes": ["note1"]}\n```'
        assert extract_first_json_object(fenced) == {
            "cleaned": "test",
            "notes": ["note1"],
        }

    def test_json_embedded_in_prose(self):
        """An object surrounded by ordinary sentences is expected to be extracted."""
        sentence = 'Here is the response: {"cleaned": "test", "score": 5} and some additional text.'
        parsed = extract_first_json_object(sentence)
        assert parsed == {"cleaned": "test", "score": 5}

    def test_multiple_json_objects(self):
        """With two objects in the text, only the first is expected back."""
        parsed = extract_first_json_object('First: {"a": 1} Second: {"b": 2}')
        assert parsed == {"a": 1}

    def test_nested_objects_and_arrays(self):
        """Nested objects and arrays are expected to survive extraction intact."""
        payload = '{"cleaned": "test", "quality": {"score": 4, "reasons": ["clear", "specific"]}, "notes": ["note1", "note2"]}'
        expected = {
            "cleaned": "test",
            "quality": {"score": 4, "reasons": ["clear", "specific"]},
            "notes": ["note1", "note2"],
        }
        parsed = extract_first_json_object(payload)
        assert parsed == expected

    def test_escaped_characters_in_strings(self):
        """Escaped quotes and newlines inside string values are expected to be decoded."""
        payload = '{"cleaned": "test with \\"quotes\\" and \\n newlines", "notes": []}'
        expected = {"cleaned": 'test with "quotes" and \n newlines', "notes": []}
        assert extract_first_json_object(payload) == expected

    def test_empty_string(self):
        """An empty string is expected to raise ValueError about non-empty input."""
        message = "Input text must be a non-empty string"
        with pytest.raises(ValueError, match=message):
            extract_first_json_object("")

    def test_none_input(self):
        """None is expected to raise ValueError about non-empty input."""
        message = "Input text must be a non-empty string"
        with pytest.raises(ValueError, match=message):
            extract_first_json_object(None)

    def test_no_json_found(self):
        """Plain text without any object is expected to raise ValueError."""
        message = "No valid JSON object found"
        with pytest.raises(ValueError, match=message):
            extract_first_json_object("This is just plain text with no JSON")

    def test_malformed_json(self):
        """An object with an unterminated array is expected to raise ValueError."""
        broken = '{"cleaned": "test", "notes": [unclosed array'
        with pytest.raises(ValueError, match="No valid JSON object found"):
            extract_first_json_object(broken)

    def test_json_with_whitespace(self):
        """Leading and trailing whitespace around the object is expected to be ignored."""
        padded = ' \n {"cleaned": "test", "notes": []} \n '
        assert extract_first_json_object(padded) == {"cleaned": "test", "notes": []}

    def test_complex_nested_structure(self):
        """A multi-line document with nested values is expected to parse to the matching dict."""
        # The literal's lines are kept flush-left so the string content stays
        # exactly as written; indenting them would add whitespace to the input.
        document = """
{
"cleaned": "Create a REST API",
"notes": ["Added specific requirements"],
"quality": {
"score": 4,
"reasons": ["clear", "actionable"]
},
"open_questions": ["What framework?", "What database?"],
"risks": ["May need more context"]
}
"""
        expected = {
            "cleaned": "Create a REST API",
            "notes": ["Added specific requirements"],
            "quality": {"score": 4, "reasons": ["clear", "actionable"]},
            "open_questions": ["What framework?", "What database?"],
            "risks": ["May need more context"],
        }
        parsed = extract_first_json_object(document)
        assert parsed == expected