Memex

Overview Schema Related Servers Score Discussions

memex
memex-rs
src
embedding

mod.rs

mod.rs•8.93 KiB

//! Text processing module.
//!
//! Provides text utilities such as the [`Chunker`].
//!
//! Note: LLM capabilities (embedding, chat) have moved to the `llm` module.

/// Text chunker.
///
/// Splits long text into chunks intended for vector indexing. Every length
/// handled here is a UTF-8 **byte** length (`str::len`), not a character count.
#[derive(Clone)]
pub struct Chunker {
    /// Upper bound, in bytes, used when sizing a chunk.
    max_length: usize,
    /// Number of bytes a hard-split chunk shares with the one before it.
    overlap: usize,
    /// Chunks shorter than this many bytes are discarded.
    min_length: usize,
}

impl Default for Chunker {
    fn default() -> Self {
        Self {
            max_length: 2000,
            overlap: 200,
            min_length: 100,
        }
    }
}

impl Chunker {
    /// Splits `content` into indexed chunks.
    ///
    /// * Blank input yields a single text chunk whose content is the literal
    ///   string `"empty"`.
    /// * Input of at most `max_length` bytes yields a single trimmed text chunk.
    /// * Longer input is separated into fenced code blocks and prose; code is
    ///   kept whole when it fits and hard-split otherwise, prose is packed
    ///   paragraph by paragraph.
    ///
    /// The returned vector is never empty and `index` values run from `0`
    /// upwards in output order.
    pub fn chunk(&self, content: &str) -> Vec<Chunk> {
        let trimmed = content.trim();

        // Blank input: keep the placeholder callers already receive.
        if trimmed.is_empty() {
            return vec![Chunk::unnumbered("empty", ChunkType::Text)];
        }

        // Short input is not split at all.
        if content.len() <= self.max_length {
            return vec![Chunk::unnumbered(trimmed, ChunkType::Text)];
        }

        let mut all_chunks = Vec::new();
        for segment in self.separate_code_and_text(content) {
            match segment.segment_type {
                SegmentType::Code => {
                    if segment.content.len() > self.max_length {
                        all_chunks.extend(
                            self.split_by_length(&segment.content)
                                .into_iter()
                                .filter(|piece| piece.len() >= self.min_length)
                                .map(|piece| Chunk::unnumbered(piece, ChunkType::Code)),
                        );
                    } else {
                        let code = segment.content.trim();
                        if code.len() >= self.min_length {
                            all_chunks.push(Chunk::unnumbered(code, ChunkType::Code));
                        }
                    }
                }
                SegmentType::Text => {
                    all_chunks.extend(self.split_text_by_paragraph(&segment.content));
                }
            }
        }

        // Safety net: every piece was filtered out as too short.
        if all_chunks.is_empty() {
            return vec![Chunk::unnumbered(trimmed, ChunkType::Text)];
        }

        // Renumber in output order.
        for (i, chunk) in all_chunks.iter_mut().enumerate() {
            chunk.index = i;
        }

        all_chunks
    }

    /// Separates `content` into alternating prose and fenced-code segments.
    ///
    /// A code segment runs from a ``` fence up to and including the nearest
    /// following ``` fence. A fence without a closing partner is treated as
    /// ordinary text. Whitespace-only prose between segments is skipped. If no
    /// segment is produced at all, the whole input is returned as one text
    /// segment.
    fn separate_code_and_text(&self, content: &str) -> Vec<Segment> {
        const FENCE: &str = "```";

        let mut segments = Vec::new();
        let mut last_index = 0;

        while let Some(open_offset) = content[last_index..].find(FENCE) {
            let open = last_index + open_offset;
            let body_start = open + FENCE.len();

            // The closing fence must start after the opening one ends.
            let close_end = match content[body_start..].find(FENCE) {
                Some(close_offset) => body_start + close_offset + FENCE.len(),
                None => break,
            };

            // Prose preceding the code block.
            let text_before = &content[last_index..open];
            if !text_before.trim().is_empty() {
                segments.push(Segment {
                    segment_type: SegmentType::Text,
                    content: text_before.to_string(),
                });
            }

            // The code block itself, fences included.
            segments.push(Segment {
                segment_type: SegmentType::Code,
                content: content[open..close_end].to_string(),
            });

            last_index = close_end;
        }

        // Trailing prose after the last code block.
        let text_after = &content[last_index..];
        if !text_after.trim().is_empty() {
            segments.push(Segment {
                segment_type: SegmentType::Text,
                content: text_after.to_string(),
            });
        }

        // Nothing was produced (e.g. no code blocks and only whitespace).
        if segments.is_empty() {
            segments.push(Segment {
                segment_type: SegmentType::Text,
                content: content.to_string(),
            });
        }

        segments
    }

    /// Packs the paragraphs of `text` (separated by blank lines) into text
    /// chunks of at most `max_length` bytes.
    ///
    /// Paragraphs are accumulated, joined by `"\n\n"`, until adding the next
    /// one would exceed `max_length`. A paragraph that is itself longer than
    /// `max_length` is hard-split with [`Self::split_by_length`].
    ///
    /// Buffers shorter than `min_length` are never emitted on their own:
    /// * when an oversized paragraph arrives, such a buffer is kept and merged
    ///   with the paragraphs that follow;
    /// * when the next paragraph does not fit, such a buffer is discarded and
    ///   the paragraph starts a new buffer.
    fn split_text_by_paragraph(&self, text: &str) -> Vec<Chunk> {
        let mut chunks = Vec::new();
        let mut current_chunk = String::new();

        let paragraphs = text
            .split("\n\n")
            .map(str::trim)
            .filter(|para| !para.is_empty());

        for para in paragraphs {
            if para.len() > self.max_length {
                // Emit what has been accumulated so far (if long enough) ...
                if self.push_text_if_long_enough(&current_chunk, &mut chunks) {
                    current_chunk.clear();
                }
                // ... then hard-split the oversized paragraph.
                chunks.extend(
                    self.split_by_length(para)
                        .into_iter()
                        .filter(|piece| piece.len() >= self.min_length)
                        .map(|piece| Chunk::unnumbered(piece, ChunkType::Text)),
                );
                continue;
            }

            // Length of "<buffer>\n\n<para>" without building the string.
            let combined_len = if current_chunk.is_empty() {
                para.len()
            } else {
                current_chunk.len() + 2 + para.len()
            };

            if combined_len > self.max_length {
                self.push_text_if_long_enough(&current_chunk, &mut chunks);
                current_chunk.clear();
            } else if !current_chunk.is_empty() {
                current_chunk.push_str("\n\n");
            }
            current_chunk.push_str(para);
        }

        // Whatever is left in the buffer.
        self.push_text_if_long_enough(&current_chunk, &mut chunks);

        chunks
    }

    /// Pushes the trimmed `buffer` as a text chunk when it is at least
    /// `min_length` bytes long. Returns whether a chunk was pushed.
    fn push_text_if_long_enough(&self, buffer: &str, chunks: &mut Vec<Chunk>) -> bool {
        let candidate = buffer.trim();
        if candidate.len() < self.min_length {
            return false;
        }
        chunks.push(Chunk::unnumbered(candidate, ChunkType::Text));
        true
    }

    /// Hard-splits `text` into windows of at most `max_length` bytes, with
    /// consecutive windows overlapping by `overlap` bytes.
    ///
    /// Each window is cut on a UTF-8 character boundary. When one of the
    /// sentence delimiters below occurs in the last 100 bytes of a window, the
    /// window ends right after the last such delimiter instead. Windows are
    /// trimmed, and those shorter than `min_length` are dropped.
    fn split_by_length(&self, text: &str) -> Vec<String> {
        const BOUNDARIES: [char; 7] = ['\n', '。', '！', '？', '.', '!', '?'];
        const BOUNDARY_WINDOW: usize = 100;

        let mut chunks = Vec::new();
        let mut start = 0;

        while start < text.len() {
            let mut end = (start + self.max_length).min(text.len());
            // Keep `end` on a character boundary.
            while end > start && !text.is_char_boundary(end) {
                end -= 1;
            }
            // `max_length` is narrower than the character at `start`: take that
            // whole character so the loop is guaranteed to advance.
            if end == start {
                end = start + 1;
                while end < text.len() && !text.is_char_boundary(end) {
                    end += 1;
                }
            }

            // Prefer ending on a sentence boundary near the end of the window.
            if end < text.len() {
                let mut search_start = (start + self.max_length)
                    .saturating_sub(BOUNDARY_WINDOW)
                    .max(start);
                // Keep `search_start` on a character boundary.
                while search_start < end && !text.is_char_boundary(search_start) {
                    search_start += 1;
                }

                // Byte offset just past the last delimiter in the search window
                // (delimiters such as '。' are multi-byte).
                let cut = text[search_start..end]
                    .char_indices()
                    .rev()
                    .find(|(_, c)| BOUNDARIES.contains(c))
                    .map(|(idx, c)| idx + c.len_utf8());
                if let Some(offset) = cut {
                    end = search_start + offset;
                }
            }

            let chunk = text[start..end].trim();
            if chunk.len() >= self.min_length {
                chunks.push(chunk.to_string());
            }

            // The window reached the end of the text. Rewinding by `overlap`
            // here would only re-emit a tail already contained in this chunk.
            if end >= text.len() {
                break;
            }

            let prev_start = start;
            start = end.saturating_sub(self.overlap);
            // Keep `start` on a character boundary.
            while start < text.len() && !text.is_char_boundary(start) {
                start += 1;
            }
            // The overlap must never move the window backwards or stall it.
            if start <= prev_start {
                start = end;
            }
        }

        chunks
    }
}

/// A chunk of text.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Zero-based position of the chunk in the output of [`Chunker::chunk`].
    pub index: usize,
    /// The chunk text.
    pub content: String,
    /// Whether the chunk came from prose or from a fenced code block.
    pub chunk_type: ChunkType,
}

impl Chunk {
    /// Builds a chunk with a placeholder index of `0`; [`Chunker::chunk`]
    /// assigns the final indices once all chunks are collected.
    fn unnumbered(content: impl Into<String>, chunk_type: ChunkType) -> Self {
        Self {
            index: 0,
            content: content.into(),
            chunk_type,
        }
    }
}

/// Kind of content a chunk holds.
#[derive(Debug, Clone, Copy)]
pub enum ChunkType {
    Text,
    Code,
}

/// A contiguous run of either prose or fenced code found in the input.
struct Segment {
    segment_type: SegmentType,
    content: String,
}

/// Kind of a [`Segment`].
enum SegmentType {
    Text,
    Code,
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/vimo-ai/memex'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

mod.rs•8.93 KiB