import { z } from "zod";
import { SequenceGenerator } from "../utils/sequenceUtils.js";
export const simulateFastq = {
definition: {
name: "simulate_fastq_file",
description: "Simulate FASTQ sequencing reads with realistic quality scores and error models based on NEAT methodology. Implementation inspired by: Stephens et al. (2016) 'Simulating Next-Generation Sequencing Datasets from Empirical Mutation and Sequencing Models.' PLOS ONE 11(11): e0167047. https://doi.org/10.1371/journal.pone.0167047",
inputSchema: {
type: "object",
properties: {
referenceSequence: {
type: "string",
description: "Reference DNA sequence to generate reads from"
},
readLength: {
type: "number",
description: "Length of each sequencing read",
minimum: 50,
maximum: 300
},
coverage: {
type: "number",
description: "Target sequencing coverage depth",
minimum: 1,
maximum: 1000
},
readType: {
type: "string",
description: "Type of reads to generate",
enum: ["single-end", "paired-end"]
},
insertSize: {
type: "number",
description: "Mean insert size for paired-end reads (ignored for single-end)",
minimum: 100,
maximum: 1000
},
insertSizeStd: {
type: "number",
description: "Standard deviation of insert size for paired-end reads",
minimum: 10,
maximum: 200
},
errorRate: {
type: "number",
description: "Base calling error rate (0-1)",
minimum: 0,
maximum: 0.1
},
qualityModel: {
type: "string",
description: "Quality score model to use",
enum: ["illumina", "454", "ion-torrent", "pacbio"]
},
mutationRate: {
type: "number",
description: "Rate of true mutations to introduce (0-1)",
minimum: 0,
maximum: 0.05
},
seed: {
type: "number",
description: "Random seed for reproducible results (optional)"
},
outputFormat: {
type: "string",
description: "Output format for the reads",
enum: ["fastq", "json"]
}
},
required: ["referenceSequence", "readLength", "coverage"]
},
},
async handler({
referenceSequence,
readLength,
coverage,
readType = "single-end",
insertSize = 300,
insertSizeStd = 50,
errorRate = 0.01,
qualityModel = "illumina",
mutationRate = 0.001,
seed,
outputFormat = "fastq"
}: {
referenceSequence: string;
readLength: number;
coverage: number;
readType?: string;
insertSize?: number;
insertSizeStd?: number;
errorRate?: number;
qualityModel?: string;
mutationRate?: number;
seed?: number;
outputFormat?: string;
}) {
const generator = new SequenceGenerator(seed);
const refLength = referenceSequence.length;
// Calculate number of reads needed
const totalBasesNeeded = refLength * coverage;
const effectiveReadLength = readType === "paired-end" ? readLength * 2 : readLength;
const numReads = Math.ceil(totalBasesNeeded / effectiveReadLength);
const reads: any[] = [];
for (let i = 0; i < numReads; i++) {
if (readType === "single-end") {
const read = generateSingleEndRead(
referenceSequence,
readLength,
errorRate,
mutationRate,
qualityModel,
generator,
i + 1
);
reads.push(read);
} else {
const readPair = generatePairedEndReads(
referenceSequence,
readLength,
insertSize,
insertSizeStd,
errorRate,
mutationRate,
qualityModel,
generator,
i + 1
);
reads.push(readPair);
}
}
const stats = {
totalReads: readType === "paired-end" ? reads.length * 2 : reads.length,
readLength,
coverage: Math.round((reads.length * effectiveReadLength / refLength) * 100) / 100,
readType,
errorRate,
mutationRate,
qualityModel,
seed: seed || "random",
citation: "Stephens et al. (2016) 'Simulating Next-Generation Sequencing Datasets from Empirical Mutation and Sequencing Models.' PLOS ONE 11(11): e0167047. DOI: 10.1371/journal.pone.0167047"
};
let output = '';
if (outputFormat === 'fastq') {
if (readType === "single-end") {
output = reads.map(read => formatFastqRecord(read)).join('\n');
} else {
output = reads.map(pair =>
formatFastqRecord(pair.read1) + '\n' + formatFastqRecord(pair.read2)
).join('\n');
}
} else {
output = JSON.stringify(reads, null, 2);
}
return {
content: [{
type: "text",
text: JSON.stringify({
statistics: stats,
reads: outputFormat === 'fastq' ? undefined : reads,
fastqOutput: outputFormat === 'fastq' ? output : undefined
}, null, 2)
}]
};
}
};
function generateSingleEndRead(
reference: string,
readLength: number,
errorRate: number,
mutationRate: number,
qualityModel: string,
generator: SequenceGenerator,
readId: number
): any {
// Random start position
const maxStart = Math.max(0, reference.length - readLength);
const startPos = Math.floor(generator.rng() * (maxStart + 1));
const endPos = Math.min(startPos + readLength, reference.length);
let sequence = reference.substring(startPos, endPos);
// Apply mutations first (true biological variants)
sequence = applyMutations(sequence, mutationRate, generator);
// Apply sequencing errors
const { errorSequence, qualities } = applySequencingErrors(
sequence,
errorRate,
qualityModel,
generator
);
return {
id: `sim_read_${readId}`,
sequence: errorSequence,
qualities,
startPos,
endPos: startPos + errorSequence.length,
strand: "+",
mutations: sequence !== reference.substring(startPos, endPos),
errors: errorSequence !== sequence
};
}
function generatePairedEndReads(
reference: string,
readLength: number,
insertSize: number,
insertSizeStd: number,
errorRate: number,
mutationRate: number,
qualityModel: string,
generator: SequenceGenerator,
readId: number
): any {
// Generate insert size with normal distribution approximation
const actualInsertSize = Math.max(
readLength * 2,
Math.round(insertSize + (generator.rng() - 0.5) * 2 * insertSizeStd)
);
const maxStart = Math.max(0, reference.length - actualInsertSize);
const fragmentStart = Math.floor(generator.rng() * (maxStart + 1));
const fragmentEnd = Math.min(fragmentStart + actualInsertSize, reference.length);
// Read 1 (forward)
const read1Start = fragmentStart;
const read1End = Math.min(read1Start + readLength, reference.length);
let read1Seq = reference.substring(read1Start, read1End);
// Read 2 (reverse)
const read2End = fragmentEnd;
const read2Start = Math.max(read2End - readLength, 0);
let read2Seq = reverseComplement(reference.substring(read2Start, read2End));
// Apply mutations and errors to both reads
read1Seq = applyMutations(read1Seq, mutationRate, generator);
read2Seq = applyMutations(read2Seq, mutationRate, generator);
const read1Result = applySequencingErrors(read1Seq, errorRate, qualityModel, generator);
const read2Result = applySequencingErrors(read2Seq, errorRate, qualityModel, generator);
return {
read1: {
id: `sim_read_${readId}/1`,
sequence: read1Result.errorSequence,
qualities: read1Result.qualities,
startPos: read1Start,
endPos: read1End,
strand: "+",
insertSize: actualInsertSize
},
read2: {
id: `sim_read_${readId}/2`,
sequence: read2Result.errorSequence,
qualities: read2Result.qualities,
startPos: read2Start,
endPos: read2End,
strand: "-",
insertSize: actualInsertSize
}
};
}
function applyMutations(sequence: string, mutationRate: number, generator: SequenceGenerator): string {
const bases = ['A', 'T', 'G', 'C'];
let mutated = sequence.split('');
for (let i = 0; i < mutated.length; i++) {
if (generator.rng() < mutationRate) {
const currentBase = mutated[i];
const otherBases = bases.filter(b => b !== currentBase);
mutated[i] = otherBases[Math.floor(generator.rng() * otherBases.length)];
}
}
return mutated.join('');
}
function applySequencingErrors(
sequence: string,
errorRate: number,
qualityModel: string,
generator: SequenceGenerator
): { errorSequence: string; qualities: string } {
const bases = ['A', 'T', 'G', 'C'];
let errorSequence = sequence.split('');
let qualities = '';
for (let i = 0; i < sequence.length; i++) {
let hasError = generator.rng() < errorRate;
let quality: number;
if (hasError) {
// Apply error
const currentBase = errorSequence[i];
const otherBases = bases.filter(b => b !== currentBase);
errorSequence[i] = otherBases[Math.floor(generator.rng() * otherBases.length)];
// Lower quality for error positions
quality = generateQualityScore(qualityModel, generator, true);
} else {
// No error, higher quality
quality = generateQualityScore(qualityModel, generator, false);
}
qualities += String.fromCharCode(33 + quality);
}
return {
errorSequence: errorSequence.join(''),
qualities
};
}
function generateQualityScore(qualityModel: string, generator: SequenceGenerator, hasError: boolean): number {
let quality: number;
switch (qualityModel) {
case "illumina":
if (hasError) {
quality = Math.floor(generator.rng() * 20) + 2; // Q2-Q22
} else {
quality = Math.floor(generator.rng() * 20) + 20; // Q20-Q40
}
break;
case "454":
if (hasError) {
quality = Math.floor(generator.rng() * 15) + 2; // Q2-Q17
} else {
quality = Math.floor(generator.rng() * 20) + 15; // Q15-Q35
}
break;
case "ion-torrent":
if (hasError) {
quality = Math.floor(generator.rng() * 10) + 2; // Q2-Q12
} else {
quality = Math.floor(generator.rng() * 15) + 10; // Q10-Q25
}
break;
case "pacbio":
if (hasError) {
quality = Math.floor(generator.rng() * 8) + 2; // Q2-Q10
} else {
quality = Math.floor(generator.rng() * 10) + 8; // Q8-Q18
}
break;
default:
quality = hasError ? 10 : 30;
}
return Math.min(93, Math.max(0, quality)); // Clamp to valid FASTQ range
}
function reverseComplement(sequence: string): string {
const complement: Record<string, string> = { 'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G' };
return sequence
.split('')
.reverse()
.map(base => complement[base] || base)
.join('');
}
function formatFastqRecord(read: any): string {
return `@${read.id}\n${read.sequence}\n+\n${read.qualities}`;
}