# max_tokens.py
from tiktoken import get_encoding


def tokenizer(
    text: str,
    max_tokens: int = 128_000,
    encoding_name: str = "cl100k_base",
) -> tuple[str, int, bool]:
"""
Tokenizes a string and, if necessary, truncates it to a maximum number of tokens.
Args:
text (str): The input text to tokenize.
max_tokens (int): The maximum number of tokens to return.
encoding_name (str, optional): The encoding to use. Defaults to "cl100k_base".
Returns:
tuple: A tuple containing:
- output_text (str): The output text, truncated if necessary to max_tokens tokens.
- original_token_count (int): Number of tokens in the original text.
- was_truncated (bool): True if truncation occurred.
Raises:
Exception: For tiktoken errors.
"""
    encoding = get_encoding(encoding_name)
    tokens = encoding.encode(text)
    original_token_count = len(tokens)
    was_truncated = original_token_count > max_tokens
    # Slicing never raises: it is a no-op when the text already fits.
    output_text = encoding.decode(tokens[:max_tokens])
    return output_text, original_token_count, was_truncated