# main_setup.py
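"""Fetch a page from the Japanese React docs, split it into h2 sections, and load
them into DuckDB with a vector embedding column (vss) and a Japanese full-text
search index (fts)."""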
import json
import os

import duckdb
import requests
import torch
from lxml import html

from common import duckdb_file, ja_tokens, section_data_file, v_model, v_tokenizer
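
# Source page to index: the "JavaScript in JSX with Curly Braces" article on the
# Japanese React docs.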
url = "https://ja.react.dev/learn/javascript-in-jsx-with-curly-braces"

def extract_section():
    """Scrape the page, split it into h2-delimited sections, and dump them to JSON."""
    print(f"Fetching {url}")
    response = requests.get(url)
    print(response.status_code)
    response.raise_for_status()
    tree = html.fromstring(response.content)
    sections = []
    for h2 in tree.xpath("//h2"):
        # Each h2 usually carries an anchor link; keep it as an absolute URL when present.
        hrefs = h2.xpath(".//a/@href")
        link = (url + hrefs[0]) if hrefs else None
        title = h2.text_content().strip()
        # Everything up to the next h2 belongs to this section.
        content = []
        for sib in h2.itersiblings():
            if sib.tag == "h2":
                break
            content.append(sib.text_content().strip())
        sections.append({"title": title, "contents": content, "link": link})
    print(f"Extracted {len(sections)} sections")
    with open(section_data_file, "w", encoding="utf-8") as f:
        json.dump(sections, f, indent=4, ensure_ascii=False)

def setup_duckdb():
    """Create a fresh DuckDB file and load the sections with vector and full-text columns."""
    print(f"Rebuilding {duckdb_file}")
    if os.path.exists(duckdb_file):
        os.remove(duckdb_file)
    conn = duckdb.connect(duckdb_file)
    # Vector similarity search (vss) and full-text search (fts) extensions.
    conn.install_extension("vss")
    conn.load_extension("vss")
    conn.install_extension("fts")
    conn.load_extension("fts")
    conn.sql("CREATE SEQUENCE IF NOT EXISTS id_sequence START 1;")
    # content   : raw section text
    # content_v : 2048-dimensional embedding of the section
    # content_t : tokenized Japanese text (ja_tokens) that the FTS index is built on
    conn.sql("""
        CREATE TABLE sora_doc (
            id INTEGER DEFAULT nextval('id_sequence') PRIMARY KEY,
            content VARCHAR,
            content_v FLOAT[2048],
            content_t VARCHAR
        );
    """)
    with open(section_data_file, "r", encoding="utf-8") as f:
        sections = json.load(f)
    docs = ["".join(section["contents"]) for section in sections]
    with torch.inference_mode():
        for doc, doc_embedding in zip(docs, v_model.encode_document(docs, v_tokenizer)):
            conn.execute(
                "INSERT INTO sora_doc (content, content_v, content_t) VALUES (?, ?, ?)",
                [
                    doc,
                    doc_embedding.cpu().squeeze().numpy().tolist(),
                    ja_tokens(doc),
                ],
            )
    # Index the pre-tokenized Japanese text; disable the English-oriented
    # stemming, stopword, and lowercasing steps.
    conn.sql("""
        PRAGMA create_fts_index(
            'sora_doc',
            'id',
            'content_t',
            stemmer = 'none',
            stopwords = 'none',
            ignore = '',
            lower = false,
            strip_accents = false
        );
    """)

def main():
    extract_section()
    setup_duckdb()


if __name__ == "__main__":
    main()