Skip to main content
Glama
8b-is
by 8b-is
test_format_detector.rs (3.15 kB)
// Test the Universal Format Detector use anyhow::Result; use st::universal_format_detector::{demo_format_detection, UniversalFormatDetector}; fn main() -> Result<()> { println!("🔍 Universal Format Detector - Structural Analysis Demo\n"); println!("{}\n", "=".repeat(60)); // Run the built-in demo demo_format_detection()?; // Test with a complex mixed format println!("Testing Complex ChatGPT Export Format:\n"); let chatgpt_export = r#"{ "conversations": [ { "id": "abc-123", "create_time": 1234567890.123, "messages": [ { "author": { "role": "user" }, "content": { "parts": ["Can you help me understand tokenization?"] } }, { "author": { "role": "assistant" }, "content": { "parts": ["Tokenization is the process of breaking down text into smaller units called tokens. These tokens can be words, subwords, or even characters, depending on the tokenization strategy used. In natural language processing, tokenization is a fundamental preprocessing step that converts raw text into a format that machine learning models can understand and process.\n\nHere's how it works:\n\n1. **Word-level tokenization**: Splits text by spaces and punctuation\n2. **Subword tokenization**: Breaks words into smaller meaningful units\n3. **Character-level tokenization**: Uses individual characters as tokens\n\nModern language models like GPT use sophisticated tokenization methods like Byte-Pair Encoding (BPE) or SentencePiece to handle diverse languages and reduce vocabulary size while maintaining semantic meaning."] } }, { "author": { "role": "user" }, "content": { "parts": ["That's helpful! 
Can you show me a code example?"] } } ] } ] }"#; let mut detector = UniversalFormatDetector::new(); let format = detector.detect_format(chatgpt_export); detector.analyze_structure(chatgpt_export)?; println!("Detected format: {:?}\n", format); println!("{}", detector.get_conversation_summary()); if let Some((speaker, bytes)) = detector.get_dominant_speaker() { println!("\nDominant speaker: {} ({} bytes total)", speaker, bytes); } // Show structural tokens let tokens = detector.tokenize_structure(); if !tokens.is_empty() { println!("\n🎯 Structural tokens (most frequent patterns):"); let mut sorted_tokens: Vec<_> = tokens.iter().collect(); sorted_tokens.sort_by_key(|(_, &token)| token); for (pattern, token) in sorted_tokens.iter().take(10) { println!(" 0x{:02X} = {}", token, pattern); } } println!("\n✨ The Magic:"); println!(" • Format detected by STRUCTURE, not keywords"); println!(" • Depth tracking shows nesting (XML/JSON)"); println!(" • Block analysis finds conversations"); println!(" • Pattern tokenization for compression"); println!(" • Works with ANY format!"); Ok(()) }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/8b-is/smart-tree'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.