"""Utility functions for file loading and data handling."""
import json
from pathlib import Path
def get_project_root() -> Path:
    """Return the project root directory (two levels above this module)."""
    return Path(__file__).parents[1]
def get_transcripts_dir() -> Path:
    """Return the directory holding raw transcript ``.txt`` files."""
    return get_project_root().joinpath("transcripts")
def get_preprocessed_dir() -> Path:
    """Return the directory holding preprocessed episode JSON files."""
    return get_project_root().joinpath("preprocessed")
def get_chroma_dir() -> Path:
    """Return the directory used for ChromaDB persistent storage."""
    return get_project_root().joinpath("chroma_db")
def load_transcript(episode_file: str) -> str | None:
    """
    Load raw transcript text for an episode.

    Args:
        episode_file: Filename like "Brian Chesky.txt"

    Returns:
        Transcript text, or None if the file does not exist
    """
    transcript_path = get_transcripts_dir() / episode_file
    # EAFP: try the read and handle the miss, rather than an exists()
    # pre-check that is race-prone (file could vanish between check and read).
    try:
        return transcript_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
def load_preprocessed(episode_file: str) -> dict | None:
    """
    Load preprocessed JSON data for an episode.

    Args:
        episode_file: Filename like "Brian Chesky.txt"

    Returns:
        Preprocessed data dict, or None if the JSON file does not exist
    """
    # Strip only a TRAILING ".txt". The previous str.replace(".txt", "")
    # removed every occurrence, mangling names that contain ".txt" in the
    # middle (e.g. "notes.txt.backup.txt" -> "notes.backup").
    stem = episode_file.removesuffix(".txt")
    json_path = get_preprocessed_dir() / f"{stem}.json"
    # EAFP: open directly and handle a miss instead of a racy exists() check.
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
def get_transcript_segment(
    episode_file: str,
    line_start: int,
    line_end: int
) -> str | None:
    """
    Extract a range of lines from an episode transcript.

    Args:
        episode_file: Filename like "Brian Chesky.txt"
        line_start: Starting line (1-indexed)
        line_end: Ending line (1-indexed, inclusive)

    Returns:
        The requested slice of the transcript, or None if the
        transcript file is missing
    """
    text = load_transcript(episode_file)
    if text is None:
        return None
    all_lines = text.split("\n")
    # Translate 1-indexed inclusive bounds into a 0-indexed half-open
    # slice, clamped to the available line range.
    first = line_start - 1
    if first < 0:
        first = 0
    last = line_end if line_end < len(all_lines) else len(all_lines)
    return "\n".join(all_lines[first:last])
def list_available_episodes() -> list[dict]:
    """
    List all available episodes with metadata.

    Scans the transcripts directory for ``*.txt`` files and, where a
    matching preprocessed JSON file exists, enriches each entry with
    guest name, expertise tags, and a summary.

    Returns:
        List of dicts with keys "filename", "guest", "preprocessed",
        and (when preprocessed data is readable) "expertise_tags"
        and "summary"
    """
    preprocessed_dir = get_preprocessed_dir()
    episodes = []
    for transcript_path in sorted(get_transcripts_dir().glob("*.txt")):
        json_path = preprocessed_dir / f"{transcript_path.stem}.json"
        episode_info = {
            "filename": transcript_path.name,
            "guest": transcript_path.stem,
            "preprocessed": json_path.exists(),
        }
        # Best-effort metadata enrichment: a missing, unreadable, or
        # malformed JSON file must not prevent the episode from listing.
        if episode_info["preprocessed"]:
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                if "episode" in data:
                    ep = data["episode"]
                    episode_info["guest"] = ep.get("guest", episode_info["guest"])
                    episode_info["expertise_tags"] = ep.get("expertise_tags", [])
                    episode_info["summary"] = ep.get("summary", "")
            except (OSError, ValueError, TypeError, AttributeError):
                # Narrowed from a bare `except Exception` so genuine
                # programming errors are no longer silently swallowed;
                # ValueError covers json.JSONDecodeError, TypeError /
                # AttributeError cover unexpected JSON shapes.
                pass
        episodes.append(episode_info)
    return episodes
def get_topic_by_id(episode_file: str, topic_id: str) -> dict | None:
    """
    Look up a single topic in an episode's preprocessed data.

    Args:
        episode_file: Filename like "Brian Chesky.txt"
        topic_id: Topic ID like "topic_1"

    Returns:
        Topic dict, or None if the episode or topic is not found
    """
    data = load_preprocessed(episode_file)
    if data is None:
        return None
    topics = data.get("topics", [])
    # First matching topic wins; default to None when no ID matches.
    return next(
        (entry for entry in topics if entry.get("id") == topic_id),
        None,
    )
def get_insights_for_topic(episode_file: str, topic_id: str) -> list[dict]:
    """
    Collect every insight attached to one topic of an episode.

    Args:
        episode_file: Filename like "Brian Chesky.txt"
        topic_id: Topic ID like "topic_1"

    Returns:
        List of insight dicts (empty if the episode is not preprocessed
        or the topic has no insights)
    """
    data = load_preprocessed(episode_file)
    if data is None:
        return []
    all_insights = data.get("insights", [])
    return [item for item in all_insights if item.get("topic_id") == topic_id]
def get_examples_for_topic(episode_file: str, topic_id: str) -> list[dict]:
    """
    Collect every example attached to one topic of an episode.

    Args:
        episode_file: Filename like "Brian Chesky.txt"
        topic_id: Topic ID like "topic_1"

    Returns:
        List of example dicts (empty if the episode is not preprocessed
        or the topic has no examples)
    """
    data = load_preprocessed(episode_file)
    if data is None:
        return []
    all_examples = data.get("examples", [])
    return [item for item in all_examples if item.get("topic_id") == topic_id]