# helper.py • 1.04 kB
import os
import glob
import tiktoken
def get_sec_filings_files(ticker: str) -> list:
    """Return the paths of the ``.txt`` files stored for *ticker*.

    Files are looked up in ``<directory of this script>/../data/<ticker>``.

    Args:
        ticker: Name of the sub-directory under ``data`` to search.

    Returns:
        A sorted list of path strings matching ``*.txt`` in that directory;
        an empty list when the directory is missing or holds no such files.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(script_dir, '..', 'data', ticker)
    # Escape the directory part so glob metacharacters ("[", "*", "?") that
    # happen to appear in the path or ticker are matched literally.
    pattern = os.path.join(glob.escape(data_dir), '*.txt')
    # glob returns entries in arbitrary order; sort for reproducible results.
    return sorted(glob.glob(pattern))
def get_sec_filings_file_content(file_path: str) -> str:
    """Read the file at *file_path* and return its entire text.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so the result does not vary between machines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The full contents of the file as a string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()
def write_summary(summary: str, file_path: str) -> None:
    """Write *summary* to *file_path*, replacing any existing file.

    The text is encoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so non-ASCII characters are written the same way on
    every machine.

    Args:
        summary: Text to write.
        file_path: Destination path; the file is created or truncated.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(summary)
def count_document_tokens(text, model='text-embedding-3-small'):
    """Return how many tokens *text* encodes to for *model*.

    The tokenizer is obtained from ``tiktoken.encoding_for_model(model)``;
    the count is the length of the token sequence its ``encode`` produces.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    return len(tokenizer.encode(text))
if __name__ == '__main__':
    # Ad-hoc check: count the tokens in one sample filing.
    # Resolve the sample relative to this script (the same anchoring that
    # get_sec_filings_files uses) so the result does not depend on the
    # directory the script is launched from.
    sample_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        '..', 'data', 'AAPL', 'AAPL_10K_2024-11-01.txt',
    )
    # NOTE(review): presumably the maximum input size, in tokens, of the
    # default embedding model -- confirm against the model documentation.
    max_input_tokens = 8191

    with open(sample_path, 'r', encoding='utf-8') as file:
        doc_text = file.read()

    token_count = count_document_tokens(doc_text)
    print(f'Document has {token_count} tokens')
    # A negative number here means the document alone exceeds the limit.
    print(f'Max chunk size available: {max_input_tokens - token_count} tokens remaining')