import { z } from "zod";
import { SequenceGenerator } from "../utils/sequenceUtils.js";
export const simulatePhylogeny = {
definition: {
name: "simulate_phylogeny",
description: "Simulate phylogenetic tree and sequence evolution",
inputSchema: {
type: "object",
properties: {
rootSequence: {
type: "string",
description: "Root sequence for phylogenetic simulation"
},
treeStructure: {
type: "string",
description: "Newick format tree or 'random' for random tree",
},
numTaxa: {
type: "number",
description: "Number of taxa for random tree generation",
minimum: 2
},
mutationRate: {
type: "number",
description: "Mutation rate per branch length unit",
minimum: 0
},
branchLengthVariation: {
type: "number",
description: "Variation in branch lengths (0-1)",
minimum: 0,
maximum: 1
},
molecularClock: {
type: "boolean",
description: "Use molecular clock (uniform rates)",
default: true
},
substitutionModel: {
type: "string",
description: "Substitution model: 'JC69', 'K80', 'HKY85', or 'GTR'",
enum: ["JC69", "K80", "HKY85", "GTR"]
},
seed: {
type: "number",
description: "Random seed for reproducible results (optional)"
},
outputFormat: {
type: "string",
description: "Output format: 'fasta', 'nexus', or 'phylip'",
enum: ["fasta", "nexus", "phylip"]
}
},
required: ["rootSequence"]
},
},
async handler({
rootSequence,
treeStructure = "random",
numTaxa = 5,
mutationRate = 0.1,
branchLengthVariation = 0.2,
molecularClock = true,
substitutionModel = "JC69",
seed,
outputFormat = "fasta"
}: {
rootSequence: string;
treeStructure?: string;
numTaxa?: number;
mutationRate?: number;
branchLengthVariation?: number;
molecularClock?: boolean;
substitutionModel?: string;
seed?: number;
outputFormat?: string;
}) {
const generator = new SequenceGenerator(seed);
const isDNA = /^[ATGC]+$/i.test(rootSequence.replace(/\s/g, ''));
let tree: PhyloNode;
if (treeStructure === "random") {
tree = generateRandomTree(numTaxa, generator);
} else {
tree = parseNewick(treeStructure);
}
if (!molecularClock) {
addBranchLengthVariation(tree, branchLengthVariation, generator);
}
const sequences = evolveSequencesOnTree(tree, rootSequence, mutationRate, substitutionModel, generator, isDNA);
let output = '';
const taxaNames = Object.keys(sequences);
switch (outputFormat) {
case 'fasta':
output = Object.entries(sequences).map(([name, seq]) =>
`>${name}\n${seq}`
).join('\n\n');
break;
case 'nexus':
output = generateNexusFormat(sequences, tree);
break;
case 'phylip':
output = generatePhylipFormat(sequences);
break;
}
const treeStats = calculateTreeStats(tree);
const sequenceStats = calculateSequenceDivergence(sequences, rootSequence);
return {
content: [{
type: "text",
text: JSON.stringify({
parameters: {
numTaxa: taxaNames.length,
mutationRate,
substitutionModel,
molecularClock,
seed: seed || "random"
},
treeStatistics: treeStats,
sequenceStatistics: sequenceStats,
newickTree: treeToNewick(tree),
sequences: outputFormat === 'fasta' ? sequences : undefined,
formattedOutput: output
}, null, 2)
}]
};
}
};
interface PhyloNode {
name?: string;
length: number;
children: PhyloNode[];
sequence?: string;
}
function generateRandomTree(numTaxa: number, generator: SequenceGenerator): PhyloNode {
const taxa: PhyloNode[] = Array.from({ length: numTaxa }, (_, i) => ({
name: `Taxon_${i + 1}`,
length: 0,
children: []
}));
let nodes = [...taxa];
let nodeCounter = numTaxa + 1;
while (nodes.length > 1) {
const i = Math.floor(Math.random() * nodes.length);
const j = Math.floor(Math.random() * (nodes.length - 1));
const actualJ = j >= i ? j + 1 : j;
const child1 = nodes[i];
const child2 = nodes[actualJ];
child1.length = Math.random() * 0.5 + 0.1;
child2.length = Math.random() * 0.5 + 0.1;
const parent: PhyloNode = {
name: `Node_${nodeCounter++}`,
length: 0,
children: [child1, child2]
};
nodes = nodes.filter((_, idx) => idx !== i && idx !== actualJ);
nodes.push(parent);
}
return nodes[0];
}
function parseNewick(newick: string): PhyloNode {
newick = newick.trim().replace(/;$/, '');
const parseNode = (str: string, start: number): { node: PhyloNode, end: number } => {
let i = start;
const node: PhyloNode = { length: 0, children: [] };
if (str[i] === '(') {
i++;
while (str[i] !== ')') {
const { node: child, end } = parseNode(str, i);
node.children.push(child);
i = end;
if (str[i] === ',') i++;
}
i++;
}
let name = '';
while (i < str.length && str[i] !== ',' && str[i] !== ')' && str[i] !== ':') {
name += str[i];
i++;
}
if (name) node.name = name;
if (str[i] === ':') {
i++;
let lengthStr = '';
while (i < str.length && str[i] !== ',' && str[i] !== ')') {
lengthStr += str[i];
i++;
}
node.length = parseFloat(lengthStr) || 0;
}
return { node, end: i };
};
return parseNode(newick, 0).node;
}
function addBranchLengthVariation(node: PhyloNode, variation: number, generator: SequenceGenerator): void {
const multiplier = 1 + (Math.random() - 0.5) * 2 * variation;
node.length *= multiplier;
node.children.forEach(child => addBranchLengthVariation(child, variation, generator));
}
function evolveSequencesOnTree(
tree: PhyloNode,
rootSequence: string,
mutationRate: number,
model: string,
generator: SequenceGenerator,
isDNA: boolean
): Record<string, string> {
const sequences: Record<string, string> = {};
function evolveNode(node: PhyloNode, parentSequence: string): void {
const mutations = Math.floor(node.length * mutationRate * parentSequence.length);
let currentSequence = parentSequence;
for (let i = 0; i < mutations; i++) {
if (isDNA) {
currentSequence = mutateDNA(currentSequence, model, generator);
} else {
currentSequence = mutateProtein(currentSequence, generator);
}
}
node.sequence = currentSequence;
if (node.children.length === 0 && node.name) {
sequences[node.name] = currentSequence;
}
node.children.forEach(child => evolveNode(child, currentSequence));
}
evolveNode(tree, rootSequence);
return sequences;
}
function mutateDNA(sequence: string, model: string, generator: SequenceGenerator): string {
const pos = Math.floor(Math.random() * sequence.length);
const bases = ['A', 'T', 'G', 'C'];
const current = sequence[pos];
let newBase: string;
switch (model) {
case 'JC69':
do {
newBase = bases[Math.floor(Math.random() * 4)];
} while (newBase === current);
break;
case 'K80':
const isTransition = (current === 'A' && Math.random() < 0.67) ||
(current === 'G' && Math.random() < 0.67) ||
(current === 'T' && Math.random() < 0.67) ||
(current === 'C' && Math.random() < 0.67);
if (isTransition) {
newBase = current === 'A' ? 'G' : current === 'G' ? 'A' : current === 'T' ? 'C' : 'T';
} else {
const transversions = bases.filter(b => b !== current &&
!((current === 'A' && b === 'G') || (current === 'G' && b === 'A') ||
(current === 'T' && b === 'C') || (current === 'C' && b === 'T')));
newBase = transversions[Math.floor(Math.random() * transversions.length)];
}
break;
default:
do {
newBase = bases[Math.floor(Math.random() * 4)];
} while (newBase === current);
}
return sequence.substring(0, pos) + newBase + sequence.substring(pos + 1);
}
function mutateProtein(sequence: string, generator: SequenceGenerator): string {
const pos = Math.floor(Math.random() * sequence.length);
const aminoAcids = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'];
const current = sequence[pos];
let newAA;
do {
newAA = aminoAcids[Math.floor(Math.random() * aminoAcids.length)];
} while (newAA === current);
return sequence.substring(0, pos) + newAA + sequence.substring(pos + 1);
}
function calculateTreeStats(tree: PhyloNode): any {
let totalBranchLength = 0;
let numTaxa = 0;
let maxDepth = 0;
function traverse(node: PhyloNode, depth: number): void {
totalBranchLength += node.length;
if (node.children.length === 0) {
numTaxa++;
maxDepth = Math.max(maxDepth, depth);
}
node.children.forEach(child => traverse(child, depth + child.length));
}
traverse(tree, 0);
return {
totalBranchLength: Math.round(totalBranchLength * 1000) / 1000,
numTaxa,
maxDepth: Math.round(maxDepth * 1000) / 1000,
avgBranchLength: Math.round((totalBranchLength / (numTaxa * 2 - 2)) * 1000) / 1000
};
}
function calculateSequenceDivergence(sequences: Record<string, string>, rootSequence: string): any {
const names = Object.keys(sequences);
const divergences = names.map(name => {
const seq = sequences[name];
const diffs = seq.split('').filter((char, i) => char !== rootSequence[i]).length;
return diffs / rootSequence.length;
});
return {
avgDivergenceFromRoot: Math.round((divergences.reduce((a, b) => a + b, 0) / divergences.length) * 10000) / 100,
maxDivergenceFromRoot: Math.round(Math.max(...divergences) * 10000) / 100,
minDivergenceFromRoot: Math.round(Math.min(...divergences) * 10000) / 100
};
}
function treeToNewick(tree: PhyloNode): string {
if (tree.children.length === 0) {
return `${tree.name || ''}:${tree.length}`;
}
const childStrings = tree.children.map(child => treeToNewick(child));
return `(${childStrings.join(',')})${tree.name || ''}:${tree.length}`;
}
function generateNexusFormat(sequences: Record<string, string>, tree: PhyloNode): string {
const names = Object.keys(sequences);
const seqLength = Object.values(sequences)[0].length;
let nexus = '#NEXUS\n\n';
nexus += 'BEGIN DATA;\n';
nexus += ` DIMENSIONS NTAX=${names.length} NCHAR=${seqLength};\n`;
nexus += ' FORMAT DATATYPE=DNA GAP=- MISSING=?;\n';
nexus += ' MATRIX\n';
names.forEach(name => {
nexus += ` ${name.padEnd(15)} ${sequences[name]}\n`;
});
nexus += ' ;\nEND;\n\n';
nexus += 'BEGIN TREES;\n';
nexus += ` TREE simulated = ${treeToNewick(tree)};\n`;
nexus += 'END;\n';
return nexus;
}
function generatePhylipFormat(sequences: Record<string, string>): string {
const names = Object.keys(sequences);
const seqLength = Object.values(sequences)[0].length;
let phylip = `${names.length} ${seqLength}\n`;
names.forEach(name => {
phylip += `${name.padEnd(15)} ${sequences[name]}\n`;
});
return phylip;
}