# common.py
from lindera_py import Segmenter, Tokenizer, load_dictionary
from sentence_transformers import CrossEncoder
from transformers import AutoModel, AutoTokenizer

# Shared file paths and compute device used by the indexing and search scripts.
section_data_file = "./output/sections.json"
duckdb_file = "./output/duckdb.db"
device = "cpu"

# Embedding model for vector search (pfnet/plamo-embedding-1b).
v_tokenizer = AutoTokenizer.from_pretrained(
    "pfnet/plamo-embedding-1b", trust_remote_code=True
)
v_model = AutoModel.from_pretrained(
    "pfnet/plamo-embedding-1b", trust_remote_code=True
)
v_model = v_model.to(device)

# Japanese morphological analyzer (Lindera with the IPADIC dictionary),
# used by ja_tokens() below to produce space-separated tokens.
dictionary = load_dictionary("ipadic")
segmenter = Segmenter("normal", dictionary)
tokenizer = Tokenizer(segmenter)

# Cross-encoder reranker for scoring Japanese query-passage pairs.
r_model = CrossEncoder(
    "hotchpotch/japanese-bge-reranker-v2-m3-v1", max_length=512, device=device
)


def ja_tokens(text: str) -> str:
    """Tokenize Japanese text with Lindera and join the surface forms with spaces."""
    return " ".join(t.text for t in tokenizer.tokenize(text))