deduplicate_strings
Remove duplicate strings and select a diverse subset using Jina embeddings and submodular optimization. Ideal for filtering similar content, extracting representative samples, or ensuring semantic uniqueness.
Instructions
Get top-k semantically unique strings from a list using Jina embeddings and submodular optimization. Use this when you have many similar strings and want to select the most diverse subset that covers the semantic space. Perfect for removing duplicates, selecting representative samples, or finding diverse content.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| k | No | Number of unique strings to return. If not provided, automatically finds the optimal k by looking at diminishing returns | |
| strings | Yes | Array of strings to deduplicate | |
Input Schema (JSON Schema)
{
"$schema": "http://json-schema.org/draft-07/schema#",
"additionalProperties": false,
"properties": {
"k": {
"description": "Number of unique strings to return. If not provided, automatically finds optimal k by looking at diminishing return",
"type": "number"
},
"strings": {
"description": "Array of strings to deduplicate",
"items": {
"type": "string"
},
"type": "array"
}
},
"required": [
"strings"
],
"type": "object"
}
Implementation Reference
// - src/tools/jina-tools.ts:601-692 (handler) The core handler and registration for the 'deduplicate_strings' tool. Includes Zod input schema, API call to Jina embeddings, submodular selection logic invocation, and response formatting.

/**
 * Registers the `deduplicate_strings` tool.
 *
 * Handler flow:
 *   1. Return the bearer-token error result if no token is configured.
 *   2. Validate `strings` (non-empty) and `k` (integer in 1..strings.length when given).
 *   3. POST the strings to the Jina embeddings endpoint.
 *   4. Pick a diverse subset of indices: `lazyGreedySelection` when `k` is given,
 *      otherwise `lazyGreedySelectionWithSaturation`.
 *   5. Emit one YAML-serialized `{ index, text }` text item per selected string.
 *
 * Any thrown error is converted into an error response instead of propagating.
 */
server.tool(
  "deduplicate_strings",
  "Get top-k semantically unique strings from a list using Jina embeddings and submodular optimization. Use this when you have many similar strings and want to select the most diverse subset that covers the semantic space. Perfect for removing duplicates, selecting representative samples, or finding diverse content.",
  {
    strings: z.array(z.string()).describe("Array of strings to deduplicate"),
    k: z.number().optional().describe("Number of unique strings to return. If not provided, automatically finds optimal k by looking at diminishing return"),
  },
  async ({ strings, k }: { strings: string[]; k?: number }) => {
    try {
      const props = getProps();

      const tokenError = checkBearerToken(props.bearerToken);
      if (tokenError) {
        return tokenError;
      }

      if (strings.length === 0) {
        throw new Error("No strings provided for deduplication");
      }

      // The schema only guarantees a number, so reject fractional values here:
      // the selection loop would otherwise round a fractional k up silently.
      if (k !== undefined && (!Number.isInteger(k) || k <= 0 || k > strings.length)) {
        throw new Error(`Invalid k value: ${k}. Must be an integer between 1 and ${strings.length}`);
      }

      // Get embeddings from Jina API
      const response = await fetch('https://api.jina.ai/v1/embeddings', {
        method: 'POST',
        headers: {
          'Accept': 'application/json',
          'Content-Type': 'application/json',
          'Authorization': `Bearer ${props.bearerToken}`,
        },
        body: JSON.stringify({
          model: 'jina-embeddings-v3',
          task: 'text-matching',
          input: strings,
        }),
      });

      if (!response.ok) {
        return handleApiError(response, "Getting embeddings");
      }

      const payload = (await response.json()) as { data?: unknown } | null;
      const items = payload?.data;
      if (!Array.isArray(items)) {
        throw new Error("Invalid response format from embeddings API");
      }

      // Extract embeddings
      const embeddings: number[][] = items.map((item: { embedding: number[] }) => item.embedding);

      // Selected indices are used to index `strings`, so the two arrays must line up.
      if (embeddings.length !== strings.length) {
        throw new Error(
          `Embeddings API returned ${embeddings.length} embeddings for ${strings.length} strings`
        );
      }

      // Use submodular optimization to select diverse strings:
      // fixed k when requested, otherwise stop at the saturation point.
      const selectedIndices =
        k !== undefined
          ? lazyGreedySelection(embeddings, k)
          : lazyGreedySelectionWithSaturation(embeddings).selected;

      // Return each deduplicated string as an individual text item for consistency
      const contentItems = selectedIndices.map((idx) => ({
        type: "text" as const,
        text: yamlStringify({ index: idx, text: strings[idx] }),
      }));

      return {
        content: contentItems,
      };
    } catch (error) {
      return createErrorResponse(`Error: ${error instanceof Error ? error.message : String(error)}`);
    }
  },
);
- src/tools/jina-tools.ts:604-607 (schema)Zod schema for tool inputs: array of strings and optional k for number of unique outputs.{ strings: z.array(z.string()).describe("Array of strings to deduplicate"), k: z.number().optional().describe("Number of unique strings to return. If not provided, automatically finds optimal k by looking at diminishing return") },
// - Helper functions for submodular greedy selection: cosineSimilarity, computeMarginalGainDiversity, lazyGreedySelection (fixed k), lazyGreedySelectionWithSaturation (auto k via saturation detection). Used to select semantically diverse strings from embeddings.
// Submodular optimization utilities for string deduplication

/** Queue entry: [negated marginal gain, iteration in which the gain was computed, item index]. */
type GainEntry = [number, number, number];

/**
 * Cosine similarity between two vectors.
 *
 * @returns 0 when the vectors differ in length or either has zero norm;
 *          otherwise dot(a, b) / (|a| * |b|).
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) return 0;

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;

  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  if (normA === 0 || normB === 0) return 0;
  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}

/**
 * Marginal gain of adding item `newIdx` to the selected set.
 *
 * @param newIdx           Candidate item index (a row of `similarityMatrix`).
 * @param currentCoverage  Per-item maximum similarity to the already selected set.
 * @param similarityMatrix Pairwise similarity matrix.
 * @returns Sum over all items of the increase in coverage the candidate would bring.
 */
export function computeMarginalGainDiversity(
  newIdx: number,
  currentCoverage: number[],
  similarityMatrix: number[][]
): number {
  const n = similarityMatrix.length;
  let marginalGain = 0;
  const row = similarityMatrix[newIdx];

  for (let i = 0; i < n; i++) {
    const newCoverage = row[i] > currentCoverage[i] ? row[i] : currentCoverage[i];
    marginalGain += newCoverage - currentCoverage[i];
  }

  return marginalGain;
}

/** Orders queue entries by descending gain (gains are stored negated). */
function byGainDescending(a: GainEntry, b: GainEntry): number {
  return a[0] - b[0];
}

/**
 * Pairwise cosine similarities, clamped to non-negative to ensure monotone
 * submodularity of the facility-location objective.
 */
function buildSimilarityMatrix(embeddings: number[][]): number[][] {
  return embeddings.map((a) =>
    embeddings.map((b) => {
      const sim = cosineSimilarity(a, b);
      return sim > 0 ? sim : 0;
    })
  );
}

/** Builds the initial queue: every item scored against `currentCoverage`, best gain first. */
function seedGainQueue(similarityMatrix: number[][], currentCoverage: number[]): GainEntry[] {
  const pq = similarityMatrix.map(
    (_, i): GainEntry => [-computeMarginalGainDiversity(i, currentCoverage, similarityMatrix), 0, i]
  );
  pq.sort(byGainDescending);
  return pq;
}

/**
 * Pops the best candidate whose gain was computed during `iteration`.
 * Entries scored in an earlier iteration are re-scored against the current
 * coverage and re-queued before the queue head is inspected again.
 *
 * @returns The chosen item index, or undefined when the queue is empty.
 */
function popFreshBest(
  pq: GainEntry[],
  iteration: number,
  currentCoverage: number[],
  similarityMatrix: number[][]
): number | undefined {
  let entry: GainEntry | undefined;
  while ((entry = pq.shift()) !== undefined) {
    const [, scoredAt, idx] = entry;
    if (scoredAt === iteration) return idx;

    const refreshedGain = computeMarginalGainDiversity(idx, currentCoverage, similarityMatrix);
    pq.push([-refreshedGain, iteration, idx]);
    pq.sort(byGainDescending);
  }
  return undefined;
}

/** Raises each coverage entry to at least the matching entry of `row`. */
function absorbCoverage(row: number[], currentCoverage: number[]): void {
  for (let i = 0; i < currentCoverage.length; i++) {
    if (row[i] > currentCoverage[i]) currentCoverage[i] = row[i];
  }
}

/**
 * Greedily selects `k` item indices that maximize coverage of the whole set.
 *
 * @param embeddings One embedding vector per item.
 * @param k          Number of items to select.
 * @returns Selected indices in pick order; all indices in input order when k >= item count.
 */
export function lazyGreedySelection(embeddings: number[][], k: number): number[] {
  const n = embeddings.length;
  if (k >= n) return Array.from({ length: n }, (_, i) => i);

  const similarityMatrix = buildSimilarityMatrix(embeddings);
  // Max similarity to the selected set, per item
  const currentCoverage: number[] = new Array(n).fill(0);
  const pq = seedGainQueue(similarityMatrix, currentCoverage);
  const selected: number[] = [];

  for (let iteration = 0; iteration < k; iteration++) {
    const best = popFreshBest(pq, iteration, currentCoverage, similarityMatrix);
    if (best === undefined) break;

    selected.push(best);
    absorbCoverage(similarityMatrix[best], currentCoverage);
  }

  return selected;
}

/**
 * Greedy selection that chooses the number of items automatically.
 *
 * Items are picked in order of marginal gain. After each pick the normalized
 * objective (mean coverage) is computed; selection stops at the first pick
 * whose increase in that objective is below `threshold`. That saturated pick is
 * NOT part of the result, so items that add nothing new (e.g. exact duplicates
 * of an already selected item) are excluded. The first pick is always kept.
 *
 * @param embeddings One embedding vector per item.
 * @param threshold  Minimum increase in the normalized objective for a pick to be kept.
 * @returns `selected` indices in pick order, `optimalK` (their count) and
 *          `values`, the normalized objective after each kept pick.
 */
export function lazyGreedySelectionWithSaturation(
  embeddings: number[][],
  threshold: number = 1e-2
): { selected: number[], optimalK: number, values: number[] } {
  const n = embeddings.length;
  const similarityMatrix = buildSimilarityMatrix(embeddings);
  const currentCoverage: number[] = new Array(n).fill(0);
  const pq = seedGainQueue(similarityMatrix, currentCoverage);
  const selected: number[] = [];
  const values: number[] = [];

  for (let iteration = 0; iteration < n; iteration++) {
    const best = popFreshBest(pq, iteration, currentCoverage, similarityMatrix);
    if (best === undefined) break;

    absorbCoverage(similarityMatrix[best], currentCoverage);
    const functionValue = currentCoverage.reduce((sum, val) => sum + val, 0) / n;

    // Early stop when the marginal gain (delta of normalized objective) falls
    // below threshold; the pick that triggered the stop is discarded.
    if (values.length > 0 && functionValue - values[values.length - 1] < threshold) {
      break;
    }

    selected.push(best);
    values.push(functionValue);
  }

  return { selected, optimalK: selected.length, values };
}
// - src/utils/api-error-handler.ts:56-69 (helper) checkBearerToken helper: validates presence of Jina API bearer token, returns error response if missing.

/**
 * Checks that a bearer token has been supplied.
 *
 * @param bearerToken Token to check; `undefined` and the empty string both count as missing.
 * @returns `null` when a token is present, otherwise an error result whose
 *          single text item asks the user to set an API key.
 */
export function checkBearerToken(bearerToken: string | undefined) {
  if (bearerToken) {
    // No error, token is available
    return null;
  }

  const missingKeyNotice = {
    type: "text" as const,
    text: "Please set your API key in the Jina AI MCP settings. You can get a free API key by visiting https://jina.ai and signing up for an account.",
  };

  return {
    content: [missingKeyNotice],
    isError: true,
  };
}
// - src/utils/api-error-handler.ts:5-51 (helper) handleApiError helper: standardizes error responses for API failures like 401, 402, 429, used for embeddings fetch.

/**
 * Maps a failed HTTP response to an error result with a single text item.
 *
 * Status 401, 402 and 429 each get a fixed message; any other status yields
 * `Error: <context> failed - <status> <statusText>`.
 *
 * @param response The HTTP response; only `status` and `statusText` are read.
 * @param context  Label for the failed operation, used in the fallback message.
 * @returns `{ content: [{ type: "text", text }], isError: true }`.
 */
export function handleApiError(response: Response, context: string = "API request") {
  let message: string;

  switch (response.status) {
    case 401:
      message = "Authentication failed. Please set your API key in the Jina AI MCP settings. You can get a free API key by visiting https://jina.ai and signing up for an account.";
      break;
    case 402:
      message = "This key is out of quota. Please top up this key at https://jina.ai";
      break;
    case 429:
      message = "Rate limit exceeded. Please upgrade your API key to get higher rate limits. Visit https://jina.ai to manage your subscription and increase your usage limits.";
      break;
    default:
      // Default error message for other HTTP errors
      message = `Error: ${context} failed - ${response.status} ${response.statusText}`;
      break;
  }

  return {
    content: [{ type: "text" as const, text: message }],
    isError: true,
  };
}