Skip to main content
Glama
formatter.rs (13.6 kB)
use serde_json::{json, Value}; use regex::Regex; /// Maximum size of a block in the Notion API const MAX_BLOCK_SIZE: usize = 2000; /// Split content into chunks to respect Notion API limits /// /// # Arguments /// /// * `text` - Text to be split /// * `max_length` - Maximum length of each chunk (default: 2000 characters) /// /// # Returns /// /// * `Vec<String>` - List of text chunks pub fn split_content(text: &str, max_length: usize) -> Vec<String> { if text.len() <= max_length { return vec![text.to_string()]; } // Try to split by headers let header_regex = Regex::new(r"(?m)^(#{1,3}\s.+)$").unwrap(); let headers: Vec<_> = header_regex.find_iter(text).collect(); if headers.is_empty() { // No headers, use simple method return simple_split(text, max_length); } let mut parts = Vec::new(); let mut last_pos = 0; let mut current_chunk = String::new(); // Process headers as splitting points for (i, header_match) in headers.iter().enumerate() { // Get content from last point to current header if i > 0 { let header_content = &text[last_pos..header_match.start()]; // If adding this header section would exceed max size, // start a new chunk if current_chunk.len() + header_content.len() > max_length { parts.push(current_chunk.clone()); current_chunk = header_content.to_string(); } else { current_chunk.push_str(header_content); } } // First header or after a split if current_chunk.is_empty() { current_chunk = text[header_match.start()..].to_string(); // If still too large, we'll need to split it later } last_pos = header_match.start(); } // Add final chunk if last_pos < text.len() { let final_content = &text[last_pos..]; if current_chunk.len() + final_content.len() > max_length { parts.push(current_chunk.clone()); parts.push(final_content.to_string()); } else { current_chunk.push_str(final_content); parts.push(current_chunk.clone()); } } else if !current_chunk.is_empty() { parts.push(current_chunk.clone()); } // If any chunk is still too large, split it further let mut result = 
Vec::new(); for chunk in parts { if chunk.len() > max_length { result.extend(simple_split(&chunk, max_length)); } else { result.push(chunk); } } result } /// Fallback method to split text without headers /// /// # Arguments /// /// * `text` - Text to be split /// * `max_length` - Maximum length of each chunk /// /// # Returns /// /// * `Vec<String>` - List of text chunks fn simple_split(text: &str, max_length: usize) -> Vec<String> { // For pure text without natural splitting points (like headers), // we need to ensure we're breaking the content into chunks that respect the max_length let mut chunks = Vec::new(); let mut current_chunk = String::new(); let mut in_code_block = false; let mut code_block_content = String::new(); // If text is just a repeating character (like in tests) with no structure, // split it into equal chunks first if text.chars().all(|c| c == text.chars().next().unwrap()) && text.len() > max_length { let mut remaining = text.to_string(); while !remaining.is_empty() { let split_point = std::cmp::min(max_length, remaining.len()); chunks.push(remaining[..split_point].to_string()); remaining = remaining[split_point..].to_string(); } return chunks; } for line in text.split('\n') { // Check for code block markers if line.trim().starts_with("```") { in_code_block = !in_code_block; // If we're starting a code block if in_code_block { code_block_content = format!("{}{}", line, "\n"); continue; } else { // We're ending a code block, add it as a whole code_block_content.push_str(line); if current_chunk.len() + code_block_content.len() > max_length { // If adding the whole block exceeds the limit, // finalize the current chunk and start a new one if !current_chunk.is_empty() { chunks.push(current_chunk.clone()); } chunks.push(code_block_content.clone()); current_chunk = String::new(); } else { current_chunk.push_str(&code_block_content); } code_block_content = String::new(); continue; } } // If we're inside a code block, collect the content if 
in_code_block { code_block_content.push_str(line); code_block_content.push('\n'); continue; } // For regular lines let line_with_newline = format!("{}{}", line, "\n"); if current_chunk.len() + line_with_newline.len() > max_length { if !current_chunk.is_empty() { chunks.push(current_chunk.clone()); } current_chunk = line_with_newline; } else { current_chunk.push_str(&line_with_newline); } } // Add any remaining content if !code_block_content.is_empty() { if current_chunk.len() + code_block_content.len() > max_length { if !current_chunk.is_empty() { chunks.push(current_chunk.clone()); } chunks.push(code_block_content.clone()); } else { current_chunk.push_str(&code_block_content); } } if !current_chunk.is_empty() { chunks.push(current_chunk.clone()); } // Check if any chunk still exceeds max_length (this could happen with very long lines) // and split it further if needed let mut final_chunks = Vec::new(); for chunk in chunks { if chunk.len() <= max_length { final_chunks.push(chunk); } else { let mut remaining = chunk; while !remaining.is_empty() { let split_point = std::cmp::min(max_length, remaining.len()); final_chunks.push(remaining[..split_point].to_string()); remaining = remaining[split_point..].to_string(); } } } final_chunks } /// Convert Markdown text into Notion blocks /// /// # Arguments /// /// * `text` - Markdown text to be converted /// /// # Returns /// /// * `Vec<Value>` - List of Notion blocks pub fn format_for_notion(text: &str) -> Vec<Value> { let lines: Vec<&str> = text.split('\n').collect(); let mut blocks = Vec::new(); let mut current_code_block: Option<Value> = None; let mut i = 0; while i < lines.len() { let line = lines[i].trim_end(); i += 1; // Detect start of code block (```language) if let Some(code_lang) = line.strip_prefix("```") { if current_code_block.is_none() { // Start a new code block current_code_block = Some(json!({ "type": "code", "code": { "rich_text": [], "language": get_valid_notion_language(code_lang.trim()) } })); continue; 
} } // Detect end of code block if line.trim() == "```" && current_code_block.is_some() { // Close the current code block blocks.push(current_code_block.take().unwrap()); continue; } // Add lines to current code block if let Some(ref mut code_block) = current_code_block { let line_with_newline = format!("{}{}", line, "\n"); code_block["code"]["rich_text"].as_array_mut().unwrap().push(json!({ "type": "text", "text": { "content": line_with_newline } })); continue; } // Ignore empty lines outside code blocks if line.trim().is_empty() { // Add a paragraph with a newline for spacing blocks.push(json!({ "type": "paragraph", "paragraph": { "rich_text": [] } })); continue; } // Headers if line.starts_with("# ") { blocks.push(json!({ "type": "heading_1", "heading_1": { "rich_text": [{ "text": { "content": &line[2..] } }] } })); } else if line.starts_with("## ") { blocks.push(json!({ "type": "heading_2", "heading_2": { "rich_text": [{ "text": { "content": &line[3..] } }] } })); } else if line.starts_with("### ") { blocks.push(json!({ "type": "heading_3", "heading_3": { "rich_text": [{ "text": { "content": &line[4..] 
} }] } })); } // Bulleted list else if line.starts_with("- ") || line.starts_with("* ") { let content = &line[2..]; blocks.push(json!({ "type": "bulleted_list_item", "bulleted_list_item": { "rich_text": [{ "text": { "content": content } }] } })); } // Numbered list else if Regex::new(r"^\d+\.\s").unwrap().is_match(line) { let content = Regex::new(r"^\d+\.\s").unwrap().replace(line, ""); blocks.push(json!({ "type": "numbered_list_item", "numbered_list_item": { "rich_text": [{ "text": { "content": content } }] } })); } // Regular paragraphs else { blocks.push(json!({ "type": "paragraph", "paragraph": { "rich_text": [{ "text": { "content": line } }] } })); } } // Close any remaining code block if let Some(code_block) = current_code_block { blocks.push(code_block); } blocks } fn get_valid_notion_language(language: &str) -> &str { // List of languages supported by the Notion API let valid_languages = [ "abap", "agda", "arduino", "assembly", "bash", "basic", "c", "c#", "c++", "clojure", "coffeescript", "css", "dart", "diff", "docker", "elixir", "elm", "erlang", "f#", "flow", "fortran", "go", "graphql", "groovy", "haskell", "html", "java", "javascript", "json", "julia", "kotlin", "latex", "less", "lisp", "lua", "makefile", "markdown", "matlab", "mermaid", "nix", "objective-c", "ocaml", "pascal", "perl", "php", "python", "r", "ruby", "rust", "scala", "scheme", "scss", "shell", "sql", "swift", "typescript", "vb.net", "verilog", "vhdl", "xml", "yaml" ]; // Normalize the language name let normalized = language.trim().to_lowercase(); if valid_languages.contains(&normalized.as_str()) { for &valid in &valid_languages { if valid == normalized { return valid; } } } if normalized.is_empty() { return "plain text"; } else { // Try to find a close match match normalized.as_str() { "js" | "jsx" => "javascript", "py" => "python", "ts" | "tsx" => "typescript", "sh" | "zsh" => "shell", "md" => "markdown", _ => "plain text" } } } #[cfg(test)] mod tests { use super::*; #[test] fn 
test_split_content_small_text() { let text = "This is a small text."; let chunks = split_content(text, 100); assert_eq!(chunks.len(), 1); assert_eq!(chunks[0], text); } #[test] fn test_split_content_large_text_no_headers() { let text = "A".repeat(3000); // Large text without headers let chunks = split_content(&text, 1000); assert!(chunks.len() > 1); for chunk in chunks { assert!(chunk.len() <= 1000); } } #[test] fn test_split_content_with_headers() { let text = format!( "# Title 1\n{}\n\n## Title 2\n{}\n\n### Title 3\n{}", "A".repeat(900), "B".repeat(900), "C".repeat(900) ); let chunks = split_content(&text, 1000); assert!(chunks.len() > 1); for chunk in chunks { assert!(chunk.len() <= 1000); } } #[test] fn test_format_for_notion_heading() { let text = "# Main Title"; let blocks = format_for_notion(text); assert_eq!(blocks.len(), 1); assert_eq!(blocks[0]["type"], "heading_1"); assert_eq!(blocks[0]["heading_1"]["rich_text"][0]["text"]["content"], "Main Title"); } #[test] fn test_format_for_notion_paragraph() { let text = "This is a normal paragraph."; let blocks = format_for_notion(text); assert_eq!(blocks.len(), 1); assert_eq!(blocks[0]["type"], "paragraph"); assert_eq!(blocks[0]["paragraph"]["rich_text"][0]["text"]["content"], text); } }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Harry-027/JotDown'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.