"""JSON Skeleton Generator - Core logic for creating lightweight JSON representations."""
import hashlib
import json
from pathlib import Path
from typing import Any, Dict, List, Optional
class SkeletonGenerator:
    """Generate lightweight skeleton representations of JSON data.

    A skeleton keeps the shape of the input (objects, arrays, keys) while
    shrinking its payload: long strings are truncated and arrays are reduced
    to a handful of representative items.

    Attributes:
        max_value_length: Strings longer than this are cut to this many
            characters and suffixed with ``truncation_suffix``.
        truncation_suffix: Marker appended to truncated strings.
        type_only_mode: When true, scalar values are replaced by short type
            names ("int", "str", ...) and arrays keep only their first item.
    """

    # Upper bound on the number of distinct primitive examples kept per array.
    # Expected to be >= 1; subclasses may override it.
    MAX_PRIMITIVE_EXAMPLES = 3

    def __init__(self, max_value_length: int = 200):
        self.max_value_length = max_value_length
        self.truncation_suffix = "...(truncated)"
        self.type_only_mode = False

    def create_skeleton(self, data: Any) -> Any:
        """Create a lightweight skeleton of *data*.

        The skeleton has:
        - Truncated string values (configurable max length)
        - Deduplicated array items (keeping unique DTOs)

        Args:
            data: Any JSON-compatible value (as produced by ``json.load``).

        Returns:
            A new value with the same nesting as *data*; the input is not
            modified.
        """
        return self._process_value(data)

    def _process_value(self, value: Any) -> Any:
        """Return the skeleton form of a single value.

        Containers are processed recursively in both modes. Scalars are
        returned as-is in normal mode (strings possibly truncated) or as a
        short type name in type-only mode. Values of any other type are
        represented by their class name.
        """
        # Containers recurse identically in both modes.
        if isinstance(value, list):
            return self._process_array(value)
        if isinstance(value, dict):
            return self._process_object(value)

        if self.type_only_mode:
            if value is None:
                return "null"
            # bool must be tested before int: bool is a subclass of int.
            if isinstance(value, bool):
                return "bool"
            if isinstance(value, int):
                return "int"
            if isinstance(value, float):
                return "float"
            if isinstance(value, str):
                return "str"
            return type(value).__name__

        # Normal mode - return actual values.
        if value is None or isinstance(value, (bool, int, float)):
            return value
        if isinstance(value, str):
            if len(value) > self.max_value_length:
                return value[:self.max_value_length] + self.truncation_suffix
            return value
        # Unknown (non-JSON) types are represented by their class name.
        return type(value).__name__

    def _process_array(self, array: List[Any]) -> List[Any]:
        """Reduce an array to representative items.

        - Empty array: returned as ``[]``.
        - Type-only mode: only the first element is kept.
        - First element is a primitive: keep up to
          ``MAX_PRIMITIVE_EXAMPLES`` distinct items, in order of appearance.
        - Otherwise: keep the first item of every distinct structure
          (see ``_get_structure_hash``).
        """
        if not array:
            return []

        if self.type_only_mode:
            # In type_only mode, just show the type of the first element.
            return [self._process_value(array[0])]

        # The first element decides which deduplication strategy is used.
        if not isinstance(array[0], (dict, list)):
            examples: List[Any] = []
            seen_values = set()
            for item in array:
                digest = self._hash_value(item)
                if digest in seen_values:
                    continue
                seen_values.add(digest)
                examples.append(self._process_value(item))
                if len(examples) >= self.MAX_PRIMITIVE_EXAMPLES:
                    break
            return examples

        # For object/array items, deduplicate by structure.
        representatives: List[Any] = []
        seen_structures = set()
        for item in array:
            digest = self._get_structure_hash(item)
            if digest not in seen_structures:
                seen_structures.add(digest)
                representatives.append(self._process_value(item))
        return representatives

    def _process_object(self, obj: Dict[str, Any]) -> Dict[str, Any]:
        """Return a new dict with every value of *obj* processed recursively."""
        return {key: self._process_value(value) for key, value in obj.items()}

    def _get_structure_hash(self, value: Any) -> str:
        """Return a key identifying the structure of *value*.

        Used for DTO deduplication:
        - dict: MD5 of its keys mapped to the shallow type signature of each
          value, so nested differences below the first level are not
          distinguished.
        - list: derived from the structure of its first item only.
        - anything else: its type signature.
        """
        if isinstance(value, dict):
            signature = {k: self._get_type_signature(v) for k, v in value.items()}
            # sort_keys makes the digest independent of key insertion order.
            canonical = json.dumps(signature, sort_keys=True)
            return hashlib.md5(canonical.encode()).hexdigest()
        if isinstance(value, list):
            if value:
                return f"array_of_{self._get_structure_hash(value[0])}"
            return "empty_array"
        return self._get_type_signature(value)

    def _get_type_signature(self, value: Any) -> str:
        """Return the JSON-style type name of *value* for structure comparison."""
        if value is None:
            return "null"
        # bool must be tested before the numeric check: bool subclasses int.
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, (int, float)):
            return "number"
        if isinstance(value, str):
            return "string"
        if isinstance(value, list):
            return "array"
        if isinstance(value, dict):
            return "object"
        return type(value).__name__

    def _hash_value(self, value: Any) -> str:
        """Hash a primitive value for deduplication.

        The digest covers both the type and the ``repr`` of the value, so
        values that merely print alike (``1`` vs ``"1"``, ``None`` vs
        ``"None"``) are kept distinct. ``repr`` also escapes lone surrogates,
        which JSON input may contain and which cannot be UTF-8 encoded
        directly.
        """
        tagged = f"{type(value).__name__}:{value!r}"
        return hashlib.md5(tagged.encode()).hexdigest()

    def process_file(self, file_path: str, max_length: Optional[int] = None, type_only: bool = False) -> Dict[str, Any]:
        """Process a JSON file and return its skeleton with file metadata.

        Note that this updates the generator's configuration:
        ``type_only_mode`` is always set to *type_only*, and
        ``max_value_length`` is overwritten when *max_length* is given. Both
        settings persist on the instance after the call.

        Args:
            file_path: Path to a UTF-8 encoded JSON file.
            max_length: New maximum string length, or ``None`` to keep the
                current setting.
            type_only: Whether to emit type names instead of values.

        Returns:
            A dict with keys ``"file_path"`` (absolute path as a string),
            ``"file_size"`` (size in bytes) and ``"skeleton"``.

        Raises:
            FileNotFoundError: If *file_path* does not exist.
            ValueError: If the path is not a regular file or the content is
                not valid JSON.
            Exception: If reading the file fails for any other reason; the
                original error is attached as ``__cause__``.
        """
        # Update max_length if provided.
        if max_length is not None:
            self.max_value_length = max_length
        # Set type_only mode.
        self.type_only_mode = type_only

        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        if not path.is_file():
            raise ValueError(f"Path is not a file: {file_path}")

        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON file: {e}") from e
        except Exception as e:
            # Kept as a plain Exception for backward compatibility with
            # existing callers; the cause is chained for debugging.
            raise Exception(f"Error reading file: {e}") from e

        skeleton = self.create_skeleton(data)
        return {
            "file_path": str(path.absolute()),
            "file_size": path.stat().st_size,
            "skeleton": skeleton
        }