Skip to main content
Glama
deduplication.integration.test.ts6.95 kB
import { deduplicatePapers, type PaperLike } from "../deduplication.js"; /** * Integration tests for deduplication with real-world scenarios * Note: These tests may require network access and could be slow */ describe("Deduplication Integration Tests", () => { test("Test 6: Google Scholar real data - verify low duplicate rate", () => { // Simulate Google Scholar results with duplicates const papers: PaperLike[] = [ { title: "CRISPR gene therapy for cancer treatment", authors: "Smith, J.", year: "2024", journal: "Nature Medicine", }, { title: "CRISPR gene therapy for cancer treatment", authors: "Smith, J.", year: "2024", journal: "Nature Medicine", doi: "10.1234/nature.2024.001", }, { title: "CRISPR gene therapy for cancer treatment - Preprint", authors: "Smith, J.", year: "2024", }, { title: "CRISPR-Based Gene Therapy in Cancer", authors: "Smith, J.", year: "2024", doi: "10.1234/nature.2024.001", }, { title: "Novel approaches to diabetes treatment", authors: "Jones, M.", year: "2024", }, { title: "Novel approaches to diabetes treatment", authors: "Jones, M.", year: "2024", }, ]; const result = deduplicatePapers(papers); // Should have < 5% duplicate rate (6 papers -> should have at least 2 unique) const duplicateRate = result.duplicatesRemoved / result.totalResults; expect(duplicateRate).toBeLessThan(0.5); // Less than 50% duplicates expect(result.uniqueResults).toBeGreaterThanOrEqual(2); expect(result.uniqueResults).toBeLessThanOrEqual(3); // Should deduplicate to 2-3 unique papers }); test("Test 7: Cross-source deduplication - same paper from PubMed + Google Scholar", () => { // Simulate same paper from different sources const pubmedPaper: PaperLike = { title: "Treatment of Type 2 Diabetes with Metformin", authors: ["Smith, J.", "Jones, M."], publication_date: "2024-01-15", doi: "10.1234/pubmed.2024.001", journal: "New England Journal of Medicine", abstract: "This study examines...", }; const scholarPaper: PaperLike = { title: "Treatment of Type 2 Diabetes with Metformin", authors: "Smith, J., Jones, M.", year: "2024", doi: "10.1234/pubmed.2024.001", journal: "NEJM", abstract: "This study examines...", }; const papers = [pubmedPaper, scholarPaper]; const result = deduplicatePapers(papers); // Should deduplicate to 1 paper (same DOI) expect(result.uniqueResults).toBe(1); expect(result.duplicatesRemoved).toBe(1); expect(result.papers[0].doi).toBe("10.1234/pubmed.2024.001"); }); test("Test 7b: Cross-source without DOI - match by title+author+year", () => { const pubmedPaper: PaperLike = { title: "CRISPR Applications in Oncology", authors: ["Brown, A."], publication_date: "2023-06-01", journal: "Cancer Research", }; const scholarPaper: PaperLike = { title: "CRISPR Applications in Oncology", authors: "Brown, A.", year: "2023", journal: "Cancer Research", }; const papers = [pubmedPaper, scholarPaper]; const result = deduplicatePapers(papers); // Should match by title + author + year expect(result.uniqueResults).toBe(1); expect(result.duplicatesRemoved).toBe(1); }); test("Test 3: Performance test - 100+ papers, verify <50ms overhead", () => { // Generate 100 papers with some duplicates const papers: PaperLike[] = []; for (let i = 0; i < 50; i++) { papers.push({ title: `Paper ${i}`, authors: `Author ${i}`, year: "2024", }); // Add duplicate papers.push({ title: `Paper ${i}`, authors: `Author ${i}`, year: "2024", }); } const startTime = Date.now(); const result = deduplicatePapers(papers); const endTime = Date.now(); const duration = endTime - startTime; // Should complete in reasonable time (< 50ms for 100 papers) expect(duration).toBeLessThan(100); // Allow some buffer expect(result.uniqueResults).toBe(50); expect(result.duplicatesRemoved).toBe(50); }); test("Real-world scenario: Multiple versions of same paper", () => { const papers: PaperLike[] = [ { title: "Gene Therapy for Cancer: A Comprehensive Review", authors: "Smith, J.", year: "2024", journal: "Nature", doi: "10.1234/nature.2024.001", }, { title: "Gene Therapy for Cancer: A Comprehensive Review [Preprint]", authors: "Smith, J.", year: "2024", }, { title: "Gene Therapy for Cancer: A Comprehensive Review - Version 2", authors: "Smith, J.", year: "2024", doi: "10.1234/nature.2024.001", }, { title: "Gene Therapy for Cancer: A Comprehensive Review", authors: "Smith, J.", year: "2024", journal: "Nature", doi: "10.1234/nature.2024.001", }, { title: "Different Paper on Cancer", authors: "Jones, M.", year: "2024", }, ]; const result = deduplicatePapers(papers); // Should deduplicate multiple versions to 1, keep different paper expect(result.uniqueResults).toBe(2); // 1 unique review + 1 different paper expect(result.duplicatesRemoved).toBe(3); // Should keep the version with DOI const reviewPaper = result.papers.find((p) => p.title.includes("Comprehensive Review"), ); expect(reviewPaper?.doi).toBe("10.1234/nature.2024.001"); }); test("Edge case: Papers with missing metadata", () => { const papers: PaperLike[] = [ { title: "Paper A", authors: "Smith" }, // No year { title: "Paper A", authors: "Smith", year: "2024" }, { title: "Paper B", year: "2024" }, // No author { title: "Paper B", year: "2024" }, { title: "Paper C" }, // No author, no year { title: "Paper C" }, ]; const result = deduplicatePapers(papers); // Should handle missing metadata gracefully expect(result.uniqueResults).toBeLessThanOrEqual(3); expect(result.uniqueResults).toBeGreaterThanOrEqual(2); }); test("Edge case: Very similar but different papers", () => { const papers: PaperLike[] = [ { title: "CRISPR Applications in Cancer Treatment", authors: "Smith, J.", year: "2024", }, { title: "CRISPR Applications in Diabetes Treatment", authors: "Smith, J.", year: "2024", }, ]; const result = deduplicatePapers(papers); // Should NOT deduplicate - different topics expect(result.uniqueResults).toBe(2); expect(result.duplicatesRemoved).toBe(0); }); });

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/JamesANZ/medical-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server