Local RAG

Overview Schema Related Servers Score Discussions

title-extractor.test.ts•9.12 KiB

// Title Extractor Unit Tests
// Test Type: Unit Test
//
// Each extractor is expected to return an object with a `title` string and a
// `source` tag. The tests below pin three source tags: 'metadata', 'content'
// and 'filename' (the fallback derived from the file name).

import { describe, expect, it } from 'vitest'
import {
  extractDocxTitle,
  extractHtmlTitle,
  extractMarkdownTitle,
  extractPdfTitle,
  extractTxtTitle,
  fileNameToTitle,
} from '../title-extractor.js'

// ============================================
// Tests
// ============================================

describe('Title Extractor', () => {
  // --------------------------------------------
  // fileNameToTitle helper
  // --------------------------------------------
  describe('fileNameToTitle', () => {
    it('should strip extension and replace hyphens/underscores with spaces', () => {
      expect(fileNameToTitle('2024-annual-report.pdf')).toBe('2024 annual report')
    })

    it('should handle file names with multiple dots', () => {
      // Only the final extension is removed; inner dots are kept.
      expect(fileNameToTitle('report.v2.final.pdf')).toBe('report.v2.final')
    })

    it('should handle file names with underscores', () => {
      expect(fileNameToTitle('my_document_title.md')).toBe('my document title')
    })

    it('should handle file names with mixed hyphens and underscores', () => {
      expect(fileNameToTitle('project-plan_v2.txt')).toBe('project plan v2')
    })

    it('should handle file names with no extension', () => {
      expect(fileNameToTitle('README')).toBe('README')
    })
  })

  // --------------------------------------------
  // extractMarkdownTitle
  // --------------------------------------------
  describe('extractMarkdownTitle', () => {
    it('should extract title from YAML frontmatter', () => {
      const text = '---\ntitle: My Document\ndate: 2024-01-01\n---\n\nContent here.'
      const result = extractMarkdownTitle(text, 'test.md')
      expect(result.title).toBe('My Document')
      expect(result.source).toBe('metadata')
    })

    it('should extract title from YAML frontmatter with double quotes', () => {
      const text = '---\ntitle: "My Quoted Document"\n---\n\nContent here.'
      const result = extractMarkdownTitle(text, 'test.md')
      expect(result.title).toBe('My Quoted Document')
      expect(result.source).toBe('metadata')
    })

    it('should extract title from YAML frontmatter with single quotes', () => {
      const text = "---\ntitle: 'My Single Quoted Document'\n---\n\nContent here."
      const result = extractMarkdownTitle(text, 'test.md')
      expect(result.title).toBe('My Single Quoted Document')
      expect(result.source).toBe('metadata')
    })

    it('should extract first H1 heading when no frontmatter', () => {
      const text = '# My Title\n\nContent here.'
      const result = extractMarkdownTitle(text, 'test.md')
      expect(result.title).toBe('My Title')
      expect(result.source).toBe('content')
    })

    it('should prefer frontmatter over H1', () => {
      const text = '---\ntitle: Frontmatter Title\n---\n\n# Heading Title\n\nContent here.'
      const result = extractMarkdownTitle(text, 'test.md')
      expect(result.title).toBe('Frontmatter Title')
      expect(result.source).toBe('metadata')
    })

    it('should fall back to file name when no title found', () => {
      const text = 'Just some plain text without any title markers.'
      const result = extractMarkdownTitle(text, 'my-notes.md')
      expect(result.title).toBe('my notes')
      expect(result.source).toBe('filename')
    })

    it('should return source metadata for frontmatter, content for H1, filename for fallback', () => {
      const frontmatter = extractMarkdownTitle('---\ntitle: Test\n---\n', 'test.md')
      expect(frontmatter.source).toBe('metadata')

      const h1 = extractMarkdownTitle('# Test\n', 'test.md')
      expect(h1.source).toBe('content')

      const fallback = extractMarkdownTitle('no title here', 'test.md')
      expect(fallback.source).toBe('filename')
    })
  })

  // --------------------------------------------
  // extractTxtTitle
  // --------------------------------------------
  describe('extractTxtTitle', () => {
    it('should extract first line as title when followed by empty line', () => {
      const text = 'Document Title\n\nThis is the body text.'
      const result = extractTxtTitle(text, 'document.txt')
      expect(result.title).toBe('Document Title')
      expect(result.source).toBe('content')
    })

    it('should fall back to file name when first line has no empty line after', () => {
      const text = 'Line one\nLine two\nLine three'
      const result = extractTxtTitle(text, 'my-notes.txt')
      expect(result.title).toBe('my notes')
      expect(result.source).toBe('filename')
    })

    it('should fall back to file name for empty text', () => {
      const result = extractTxtTitle('', 'empty-file.txt')
      expect(result.title).toBe('empty file')
      expect(result.source).toBe('filename')
    })
  })

  // --------------------------------------------
  // extractHtmlTitle
  // --------------------------------------------
  describe('extractHtmlTitle', () => {
    it('should use readability title when available', () => {
      const result = extractHtmlTitle('Article Title', 'page.html')
      expect(result.title).toBe('Article Title')
      expect(result.source).toBe('content')
    })

    it('should fall back to file name when readability title is empty', () => {
      const result = extractHtmlTitle('', 'my-page.html')
      expect(result.title).toBe('my page')
      expect(result.source).toBe('filename')
    })

    it('should fall back to file name when readability title is whitespace only', () => {
      const result = extractHtmlTitle(' ', 'my-page.html')
      expect(result.title).toBe('my page')
      expect(result.source).toBe('filename')
    })
  })

  // --------------------------------------------
  // extractPdfTitle
  // --------------------------------------------
  describe('extractPdfTitle', () => {
    it('should use PDF metadata title when available', () => {
      const result = extractPdfTitle('Annual Report 2024', 'Some chunk text', 'report.pdf')
      expect(result.title).toBe('Annual Report 2024')
      expect(result.source).toBe('metadata')
    })

    it('should use first page chunk text when no metadata title', () => {
      const result = extractPdfTitle(undefined, 'The Unity Game Designer Playbook', 'report.pdf')
      expect(result.title).toBe('The Unity Game Designer Playbook')
      expect(result.source).toBe('content')
    })

    it('should fall back to file name when no metadata and no chunk text', () => {
      const result = extractPdfTitle(undefined, undefined, 'annual-report.pdf')
      expect(result.title).toBe('annual report')
      expect(result.source).toBe('filename')
    })

    it('should ignore metadata title if it looks like a file path', () => {
      const result = extractPdfTitle('/home/user/document.pdf', undefined, 'my-doc.pdf')
      expect(result.title).toBe('my doc')
      expect(result.source).toBe('filename')
    })

    it('should ignore metadata title if it contains backslash path', () => {
      const result = extractPdfTitle('C:\\Users\\doc.pdf', undefined, 'my-doc.pdf')
      expect(result.title).toBe('my doc')
      expect(result.source).toBe('filename')
    })

    it('should ignore metadata title if it is empty or whitespace', () => {
      const result = extractPdfTitle(' ', undefined, 'my-doc.pdf')
      expect(result.title).toBe('my doc')
      expect(result.source).toBe('filename')
    })

    it('should prefer metadata over chunk text when both available', () => {
      const result = extractPdfTitle('Metadata Title', 'Chunk Title', 'fallback.pdf')
      expect(result.title).toBe('Metadata Title')
      expect(result.source).toBe('metadata')
    })

    it('should fall back from file-path metadata to chunk text', () => {
      const result = extractPdfTitle('/usr/local/doc.pdf', 'Real Title From Content', 'my-doc.pdf')
      expect(result.title).toBe('Real Title From Content')
      expect(result.source).toBe('content')
    })
  })

  // --------------------------------------------
  // extractDocxTitle
  // --------------------------------------------
  describe('extractDocxTitle', () => {
    it('should extract first h1 from mammoth HTML output', () => {
      const html = '<h1>Document Title</h1><p>Some content here.</p>'
      const result = extractDocxTitle(html, 'document.docx')
      expect(result.title).toBe('Document Title')
      expect(result.source).toBe('content')
    })

    it('should fall back to file name when no h1 found', () => {
      const html = '<p>Some content without heading.</p>'
      const result = extractDocxTitle(html, 'my-document.docx')
      expect(result.title).toBe('my document')
      expect(result.source).toBe('filename')
    })

    it('should handle HTML with no heading tags', () => {
      const html = '<p>Just a paragraph.</p><p>Another paragraph.</p>'
      const result = extractDocxTitle(html, 'notes.docx')
      expect(result.title).toBe('notes')
      expect(result.source).toBe('filename')
    })

    it('should extract only the first h1 when multiple exist', () => {
      const html = '<h1>First Title</h1><h1>Second Title</h1><p>Content.</p>'
      const result = extractDocxTitle(html, 'document.docx')
      expect(result.title).toBe('First Title')
      expect(result.source).toBe('content')
    })
  })
})

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/shinpr/mcp-local-rag'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

title-extractor.test.ts•9.12 KiB