cite-finance

ragquery.py•9.45 KiB

import os
from typing import Callable, Dict, List, Optional, Tuple

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.schema import Document
from finrobot.data_source.earnings_calls_src import get_earnings_all_docs
from finrobot.data_source.filings_src import sec_main as unstructured_sec_main
from finrobot.data_source.marker_sec_src.sec_filings_to_pdf import sec_save_pdfs
from finrobot.data_source.marker_sec_src.pdf_to_md import run_marker as run_marker_single
from finrobot.data_source.marker_sec_src.pdf_to_md_parallel import run_marker_mp
from finrobot.data_source.finance_data import get_data

# Root directory that holds the markdown conversions of the SEC filings,
# laid out as <SAVE_DIR>/<ticker>-<year>/<filing>/<filing>.md.
SAVE_DIR = "output/SEC_EDGAR_FILINGS_MD"

# Sentence-transformer model used to embed every collection in this module.
_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"


def _make_embedding_function():
    """Return the sentence-transformer embedding function shared by all stores."""
    return SentenceTransformerEmbeddings(model_name=_EMBEDDING_MODEL_NAME)


def _make_chunk_splitter():
    """Return the character splitter (1024-char chunks, 100-char overlap)."""
    return RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        length_function=len,
    )


def _format_grouped_docs(docs, metadata_key: str) -> str:
    """Group document texts by ``metadata[metadata_key]`` and render them.

    Texts that share the same metadata value are joined with a single space,
    in retrieval order. Each group is rendered as ``"<value>: <text>\\n\\n"``
    and the groups are concatenated in first-seen order.
    """
    grouped: Dict[str, List[str]] = {}
    for doc in docs:
        grouped.setdefault(doc.metadata[metadata_key], []).append(doc.page_content)
    return "".join(
        group + ": " + " ".join(texts) + "\n\n" for group, texts in grouped.items()
    )


def rag_database_earnings_call(
        ticker: str,
        year: str) -> Tuple[Callable[[str, str], str], List[str], Dict[str, List[str]]]:
    """Build an earnings-call vector store and return a query tool for it.

    Fetches the earnings-call documents for ``ticker``/``year`` through
    ``get_data``, splits them into chunks, embeds them and stores them in the
    ``earnings_call`` Chroma collection persisted under ``./earnings-call-db``.

    Args:
        ticker: Company ticker symbol passed through to ``get_data``.
        year: Year passed through to ``get_data``.

    Returns:
        A 3-tuple ``(query_database_earnings_call, earnings_call_quarter_vals,
        quarter_speaker_dict)``: the query closure, the quarter values reported
        by ``get_data``, and a mapping from ``"Q1"``..``"Q4"`` to the speaker
        list of that quarter.
    """
    (
        earnings_docs,
        earnings_call_quarter_vals,
        speakers_list_1,
        speakers_list_2,
        speakers_list_3,
        speakers_list_4,
    ) = get_data(ticker=ticker, year=year, data_source='earnings_calls')

    emb_fn = _make_embedding_function()
    earnings_calls_split_docs = _make_chunk_splitter().split_documents(earnings_docs)
    earnings_call_db = Chroma.from_documents(
        earnings_calls_split_docs,
        emb_fn,
        persist_directory="./earnings-call-db",
        collection_name="earnings_call",
    )

    quarter_speaker_dict = {
        "Q1": speakers_list_1,
        "Q2": speakers_list_2,
        "Q3": speakers_list_3,
        "Q4": speakers_list_4,
    }

    # NOTE(review): the docstring below is presumably surfaced to an agent as
    # the tool description -- confirm before rewording it further.
    def query_database_earnings_call(
            question: str,
            quarter: str) -> str:
        """This tool will query the earnings call transcripts database for a given question and quarter, and it will retrieve the relevant text from the earnings call along with the speaker who addressed it.
        This tool helps in answering questions from the earnings call transcripts.

        Args:
            question (str): Question to query the database for relevant documents.
            quarter (str): The financial quarter that is discussed in the question; possible options are Q1, Q2, Q3, Q4.

        Returns:
            str: Relevant text from the earnings call, grouped by the speaker who said it.
        """
        # Kept as an assert so callers that handle AssertionError keep working.
        assert quarter in earnings_call_quarter_vals, "The quarter should be from Q1, Q2, Q3, Q4"

        quarter_speaker_list = quarter_speaker_dict[quarter]

        # Restrict the search to speakers named in the question; compare
        # case-insensitively on both sides so "tim cook" and "TIM COOK" match.
        question_lower = question.lower()
        req_speaker_list = [
            speaker for speaker in quarter_speaker_list
            if speaker in question or speaker.lower() in question_lower
        ]
        if not req_speaker_list:
            # Nobody was named: search across every speaker of the quarter.
            req_speaker_list = quarter_speaker_list

        quarter_filter = {"quarter": {"$eq": quarter}}
        if req_speaker_list:
            search_filter = {
                "$and": [
                    quarter_filter,
                    {"speaker": {"$in": req_speaker_list}},
                ]
            }
        else:
            # Chroma rejects an empty "$in" list, so filter on the quarter only.
            search_filter = quarter_filter

        relevant_docs = earnings_call_db.similarity_search(
            question,
            k=5,
            filter=search_filter,
        )
        return _format_grouped_docs(relevant_docs, 'speaker')

    return query_database_earnings_call, earnings_call_quarter_vals, quarter_speaker_dict


def rag_database_sec(
        ticker: str,
        year: str,
        FROM_MARKDOWN: bool = False,
        filing_types: Optional[List[str]] = None) -> Tuple[Callable[[str, str], str], List[str]]:
    """Build an SEC-filings vector store and return a query tool for it.

    Two indexing modes are supported:

    * ``FROM_MARKDOWN=False``: the filings returned by ``get_data`` for the
      ``'unstructured'`` data source are chunked and stored in the
      ``sec_filings`` collection under ``./sec-filings-db``.
    * ``FROM_MARKDOWN=True``: the ``'marker_pdf'`` data source is invoked, the
      markdown files under ``SAVE_DIR/<ticker>-<year>/`` are split on their
      ``#``/``##``/``###`` headers and stored in the ``sec_filings_md``
      collection under ``./sec-filings-md-db``.

    Args:
        ticker: Company ticker symbol passed through to ``get_data``.
        year: Year passed through to ``get_data``.
        FROM_MARKDOWN: Select the markdown-based index instead of the
            unstructured one.
        filing_types: Filing types to fetch; defaults to ``['10-K', '10-Q']``.

    Returns:
        A 2-tuple ``(query_function, sec_form_names)`` where ``sec_form_names``
        is the list of form names reported by ``get_data``.
    """
    if filing_types is None:
        # A fresh list per call avoids sharing one mutable default object.
        filing_types = ['10-K', '10-Q']

    # Both modes need the form names reported by the unstructured source.
    sec_data, sec_form_names = get_data(
        ticker=ticker,
        year=year,
        data_source='unstructured',
        include_amends=True,
        filing_types=filing_types,
    )
    # Created before branching: the markdown path previously referenced
    # emb_fn without ever defining it, which raised NameError.
    emb_fn = _make_embedding_function()

    if not FROM_MARKDOWN:
        sec_filings_split_docs = _make_chunk_splitter().split_documents(sec_data)
        sec_filings_unstructured_db = Chroma.from_documents(
            sec_filings_split_docs,
            emb_fn,
            persist_directory="./sec-filings-db",
            collection_name="sec_filings",
        )

        def query_database_unstructured_sec(question: str, sec_form_name: str) -> str:
            """This tool will query the SEC Filings database for a given question and form name, and it will retrieve the relevant text from the SEC filings along with the section names.
            This tool helps in answering questions from the sec filings.

            Args:
                question (str): Question to query the database for relevant documents.
                sec_form_name (str): SEC FORM NAME that the question is talking about. It can be 10-K for yearly data and 10-Q for quarterly data. For quarterly data, it can be 10-Q2 to represent Quarter 2 and similarly for other quarters.

            Returns:
                str: Relevant context for the question from the sec filings.
            """
            relevant_docs = sec_filings_unstructured_db.similarity_search(
                question,
                k=5,
                filter={"form_name": {"$eq": sec_form_name}},
            )
            return _format_grouped_docs(relevant_docs, 'section_name')

        return query_database_unstructured_sec, sec_form_names

    # Markdown mode: invoke the marker_pdf data source before reading the
    # markdown files from SAVE_DIR.
    get_data(
        ticker=ticker,
        year=year,
        data_source='marker_pdf',
        batch_processing=False,
        batch_multiplier=1,
    )

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    company_dir = os.path.join(SAVE_DIR, f"{ticker}-{year}")
    sec_markdown_docs = []
    # Sorted so the indexing order does not depend on the file system.
    for filing_dir in sorted(os.listdir(company_dir)):
        md_file_path = os.path.join(company_dir, filing_dir, f"{filing_dir}.md")
        if not os.path.isfile(md_file_path):
            # Stray entries without a converted markdown file are skipped.
            continue
        with open(md_file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # The last two dash-separated tokens of the directory name are used
        # as the filing type (e.g. "...-10-K" -> "10-K").
        filing_type = '-'.join(filing_dir.split('-')[-2:])
        md_header_splits = markdown_splitter.split_text(content)
        for md_header_doc in md_header_splits:
            md_header_doc.metadata["filing_type"] = filing_type
        sec_markdown_docs.extend(md_header_splits)

    sec_filings_md_db = Chroma.from_documents(
        sec_markdown_docs,
        emb_fn,
        persist_directory="./sec-filings-md-db",
        collection_name="sec_filings_md",
    )

    def query_database_markdown_sec(
            question: str,
            sec_form_name: str) -> str:
        """This tool will query the SEC Filings database for a given question and form name, and it will retrieve the relevant text from the SEC filings along with the section names.
        This tool helps in answering questions from the sec filings.

        Args:
            question (str): Question to query the database for relevant documents.
            sec_form_name (str): SEC FORM NAME that the question is talking about. It can be 10-K for yearly data and 10-Q for quarterly data. For quarterly data, it can be 10-Q2 to represent Quarter 2 and similarly for other quarters.

        Returns:
            str: Relevant context for the question from the sec filings.
        """
        # Kept as an assert so callers that handle AssertionError keep working.
        assert sec_form_name in sec_form_names, f'The search form type should be in {sec_form_names}'

        relevant_docs = sec_filings_md_db.similarity_search(
            question,
            k=3,
            filter={"filing_type": {"$eq": sec_form_name}},
        )
        return "".join(doc.page_content + "\n\n" for doc in relevant_docs)

    return query_database_markdown_sec, sec_form_names

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Spectating101/cite-finance'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

ragquery.py•9.45 KiB