from typing import Any, Dict, List
from langchain_core.messages import AIMessage, AnyMessage, HumanMessage


def get_research_topic(messages: List[AnyMessage]) -> str:
    """
    Get the research topic from the messages.

    A single message is returned as-is; a longer history is combined into one
    role-prefixed string ("User: ..." / "Assistant: ...").
    """
# check if request has a history and combine the messages into a single string
if len(messages) == 1:
research_topic = messages[-1].content
else:
research_topic = ""
for message in messages:
if isinstance(message, HumanMessage):
research_topic += f"User: {message.content}\n"
elif isinstance(message, AIMessage):
research_topic += f"Assistant: {message.content}\n"
return research_topic
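

# A minimal usage sketch for get_research_topic; the message contents below
# are hypothetical. With a multi-message history, the roles are prefixed and
# concatenated in order.
def _example_get_research_topic() -> None:
    messages = [
        HumanMessage(content="Tell me about solar power."),
        AIMessage(content="Sure, which aspect?"),
        HumanMessage(content="Cost trends over the last decade."),
    ]
    topic = get_research_topic(messages)
    # topic == (
    #     "User: Tell me about solar power.\n"
    #     "Assistant: Sure, which aspect?\n"
    #     "User: Cost trends over the last decade.\n"
    # )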


def resolve_urls(urls_to_resolve: List[Any], id: int) -> Dict[str, str]:
    """
    Create a map of the Vertex AI Search URLs (very long) to a short URL with
    a unique id for each URL. Ensures each original URL gets a consistent
    shortened form while maintaining uniqueness.
    """
    prefix = "https://vertexaisearch.cloud.google.com/id/"
urls = [site["web"]["uri"] for site in urls_to_resolve]
# Create a dictionary that maps each unique URL to its first occurrence index
resolved_map = {}
for idx, url in enumerate(urls):
if url not in resolved_map:
resolved_map[url] = f"{prefix}{id}-{idx}"
return resolved_map
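

# A minimal usage sketch for resolve_urls; the URIs below are hypothetical
# stand-ins for the long Vertex AI Search redirect URLs found in grounding
# chunks, and the id argument is an arbitrary run identifier.
def _example_resolve_urls() -> None:
    chunks = [
        {"web": {"uri": "https://vertexaisearch.cloud.google.com/redirect/AAA"}},
        {"web": {"uri": "https://vertexaisearch.cloud.google.com/redirect/BBB"}},
        {"web": {"uri": "https://vertexaisearch.cloud.google.com/redirect/AAA"}},
    ]
    resolved = resolve_urls(chunks, 7)
    # Duplicate URIs collapse onto their first-occurrence index, so the AAA
    # URI maps to "https://vertexaisearch.cloud.google.com/id/7-0" and the
    # BBB URI maps to "https://vertexaisearch.cloud.google.com/id/7-1".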


def insert_citation_markers(text, citations_list):
    """
    Inserts citation markers into a text string based on start and end indices.

    Args:
        text (str): The original text string.
        citations_list (list): A list of dictionaries, where each dictionary
                               contains 'start_index', 'end_index', and a
                               'segments' list whose entries provide the
                               'label' and 'short_url' used to build each
                               markdown marker. Indices are assumed to be for
                               the original text.

    Returns:
        str: The text with citation markers inserted.
    """
# Sort citations by end_index in descending order.
# If end_index is the same, secondary sort by start_index descending.
# This ensures that insertions at the end of the string don't affect
# the indices of earlier parts of the string that still need to be processed.
sorted_citations = sorted(
citations_list, key=lambda c: (c["end_index"], c["start_index"]), reverse=True
)
modified_text = text
for citation_info in sorted_citations:
# These indices refer to positions in the *original* text,
# but since we iterate from the end, they remain valid for insertion
# relative to the parts of the string already processed.
end_idx = citation_info["end_index"]
marker_to_insert = ""
for segment in citation_info["segments"]:
marker_to_insert += f" [{segment['label']}]({segment['short_url']})"
# Insert the citation marker at the original end_idx position
modified_text = (
modified_text[:end_idx] + marker_to_insert + modified_text[end_idx:]
)
return modified_text
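

# A minimal usage sketch for insert_citation_markers; the citation dict is
# hand-built to show the expected shape (an 'end_index' insertion point and a
# 'segments' list of label/short_url pairs).
def _example_insert_citation_markers() -> None:
    text = "Solar capacity grew rapidly."
    citations = [
        {
            "start_index": 0,
            "end_index": len(text),  # insert the marker after the sentence
            "segments": [
                {
                    "label": "example",
                    "short_url": "https://vertexaisearch.cloud.google.com/id/0-0",
                }
            ],
        }
    ]
    marked = insert_citation_markers(text, citations)
    # marked == "Solar capacity grew rapidly."
    #           " [example](https://vertexaisearch.cloud.google.com/id/0-0)"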


def get_citations(response, resolved_urls_map):
    """
    Extracts and formats citation information from a Gemini model's response.

    This function processes the grounding metadata provided in the response to
    construct a list of citation objects. Each citation object includes the
    start and end indices of the text segment it refers to and a list of
    dictionaries describing the supporting web chunks.

    Args:
        response: The response object from LangChain's ChatGoogleGenerativeAI,
                  expected to have a structure including
                  response_metadata["grounding_metadata"].
        resolved_urls_map: A dictionary mapping original URLs to resolved
                           (shortened) URLs.

    Returns:
        list: A list of dictionaries, where each dictionary represents a
              citation and has the following keys:
              - "start_index" (int): The starting character index of the cited
                                     segment in the original text. Defaults to
                                     0 if not specified.
              - "end_index" (int): The character index marking the (exclusive)
                                   end of the cited segment, taken as-is from
                                   the grounding metadata and used as the
                                   marker insertion point.
              - "segments" (list[dict]): One dictionary per grounding chunk,
                                         each with 'label', 'short_url', and
                                         'value' (the original URI) keys.
              Returns an empty list if no valid grounding supports are found,
              or if essential data is missing.
    """
citations = []
# Ensure response and necessary nested structures are present
if not response:
return citations
if (
"grounding_metadata" not in response.response_metadata
or not response.response_metadata["grounding_metadata"]
or "grounding_supports" not in response.response_metadata["grounding_metadata"]
):
return citations
grounding_metadata = response.response_metadata["grounding_metadata"]
for support in grounding_metadata["grounding_supports"]:
citation = {}
# Ensure segment information is present
if "segment" not in support or not support["segment"]:
continue # Skip this support if segment info is missing
start_index = (
support["segment"]["start_index"]
if support["segment"]["start_index"] is not None
else 0
)
        # Ensure end_index is present to form a valid segment
        if support["segment"]["end_index"] is None:
            continue  # Skip if end_index is missing, as it's crucial
        # The end_index from the API is used as-is as the insertion point for
        # citation markers (see insert_citation_markers).
        citation["start_index"] = start_index
        citation["end_index"] = support["segment"]["end_index"]
citation["segments"] = []
if "grounding_chunk_indices" in support and support["grounding_chunk_indices"]:
for ind in support["grounding_chunk_indices"]:
                try:
                    chunk = grounding_metadata["grounding_chunks"][ind]
                    resolved_url = resolved_urls_map.get(chunk["web"]["uri"])
                    # The label is the first dot-separated part of the chunk
                    # title (e.g. "example" for "example.com"); titles without
                    # a dot raise IndexError and are skipped below.
                    citation["segments"].append(
                        {
                            "label": chunk["web"]["title"].split(".")[:-1][0],
                            "short_url": resolved_url,
                            "value": chunk["web"]["uri"],
                        }
                    )
                except (IndexError, AttributeError, KeyError):
                    # Handle cases where the chunk, web, uri, or title data is
                    # missing or malformed. For simplicity, skip adding this
                    # particular segment link; a production system might log it.
                    pass
citations.append(citation)
return citations
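

# A minimal usage sketch for get_citations; _FakeResponse is a hand-built
# stand-in mimicking only the response_metadata["grounding_metadata"] shape
# that the function reads, not the full ChatGoogleGenerativeAI response.
def _example_get_citations() -> None:
    class _FakeResponse:
        response_metadata = {
            "grounding_metadata": {
                "grounding_supports": [
                    {
                        "segment": {"start_index": 0, "end_index": 28},
                        "grounding_chunk_indices": [0],
                    }
                ],
                "grounding_chunks": [
                    {
                        "web": {
                            "uri": "https://example.com/article",
                            "title": "example.com",
                        }
                    }
                ],
            }
        }

    resolved_map = {
        "https://example.com/article": "https://vertexaisearch.cloud.google.com/id/0-0"
    }
    citations = get_citations(_FakeResponse(), resolved_map)
    # citations == [{
    #     "start_index": 0,
    #     "end_index": 28,
    #     "segments": [{"label": "example",
    #                   "short_url": "https://vertexaisearch.cloud.google.com/id/0-0",
    #                   "value": "https://example.com/article"}],
    # }]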