simulate_fastq_file
Generate realistic FASTQ sequencing reads from DNA reference sequences with configurable coverage, error rates, and quality models for bioinformatics testing and research.
Instructions
Simulate FASTQ sequencing reads with realistic quality scores and error models based on NEAT methodology. Implementation inspired by: Stephens et al. (2016) 'Simulating Next-Generation Sequencing Datasets from Empirical Mutation and Sequencing Models.' PLOS ONE 11(11): e0167047. https://doi.org/10.1371/journal.pone.0167047
Input Schema
TableJSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| referenceSequence | Yes | Reference DNA sequence to generate reads from | |
| readLength | Yes | Length of each sequencing read | |
| coverage | Yes | Target sequencing coverage depth | |
| readType | No | Type of reads to generate | |
| insertSize | No | Mean insert size for paired-end reads (ignored for single-end) | |
| insertSizeStd | No | Standard deviation of insert size for paired-end reads | |
| errorRate | No | Base calling error rate (0-1) | |
| qualityModel | No | Quality score model to use | |
| mutationRate | No | Rate of true mutations to introduce (0-1) | |
| seed | No | Random seed for reproducible results (optional) | |
| outputFormat | No | Output format for the reads |
Implementation Reference
- src/tools/simulateFastq.ts:74-172 (handler)Main handler function for simulate_fastq_file tool. Processes input parameters, generates sequencing reads (single-end or paired-end), calculates coverage, applies mutations and sequencing errors, generates quality scores based on the selected model (illumina/454/ion-torrent/pacbio), and formats output as FASTQ or JSON.async handler({ referenceSequence, readLength, coverage, readType = "single-end", insertSize = 300, insertSizeStd = 50, errorRate = 0.01, qualityModel = "illumina", mutationRate = 0.001, seed, outputFormat = "fastq" }: { referenceSequence: string; readLength: number; coverage: number; readType?: string; insertSize?: number; insertSizeStd?: number; errorRate?: number; qualityModel?: string; mutationRate?: number; seed?: number; outputFormat?: string; }) { const generator = new SequenceGenerator(seed); const refLength = referenceSequence.length; // Calculate number of reads needed const totalBasesNeeded = refLength * coverage; const effectiveReadLength = readType === "paired-end" ? readLength * 2 : readLength; const numReads = Math.ceil(totalBasesNeeded / effectiveReadLength); const reads: any[] = []; for (let i = 0; i < numReads; i++) { if (readType === "single-end") { const read = generateSingleEndRead( referenceSequence, readLength, errorRate, mutationRate, qualityModel, generator, i + 1 ); reads.push(read); } else { const readPair = generatePairedEndReads( referenceSequence, readLength, insertSize, insertSizeStd, errorRate, mutationRate, qualityModel, generator, i + 1 ); reads.push(readPair); } } const stats = { totalReads: readType === "paired-end" ? reads.length * 2 : reads.length, readLength, coverage: Math.round((reads.length * effectiveReadLength / refLength) * 100) / 100, readType, errorRate, mutationRate, qualityModel, seed: seed || "random", citation: "Stephens et al. (2016) 'Simulating Next-Generation Sequencing Datasets from Empirical Mutation and Sequencing Models.' PLOS ONE 11(11): e0167047. DOI: 10.1371/journal.pone.0167047" }; let output = ''; if (outputFormat === 'fastq') { if (readType === "single-end") { output = reads.map(read => formatFastqRecord(read)).join('\n'); } else { output = reads.map(pair => formatFastqRecord(pair.read1) + '\n' + formatFastqRecord(pair.read2) ).join('\n'); } } else { output = JSON.stringify(reads, null, 2); } return { content: [{ type: "text", text: JSON.stringify({ statistics: stats, reads: outputFormat === 'fastq' ? undefined : reads, fastqOutput: outputFormat === 'fastq' ? output : undefined }, null, 2) }] }; }
- src/tools/simulateFastq.ts:5-73 (schema)Tool definition and input schema for simulate_fastq_file. Defines all input parameters including referenceSequence, readLength, coverage, readType, insertSize, errorRate, qualityModel, mutationRate, seed, and outputFormat with their types, constraints, and descriptions.definition: { name: "simulate_fastq_file", description: "Simulate FASTQ sequencing reads with realistic quality scores and error models based on NEAT methodology. Implementation inspired by: Stephens et al. (2016) 'Simulating Next-Generation Sequencing Datasets from Empirical Mutation and Sequencing Models.' PLOS ONE 11(11): e0167047. https://doi.org/10.1371/journal.pone.0167047", inputSchema: { type: "object", properties: { referenceSequence: { type: "string", description: "Reference DNA sequence to generate reads from" }, readLength: { type: "number", description: "Length of each sequencing read", minimum: 50, maximum: 300 }, coverage: { type: "number", description: "Target sequencing coverage depth", minimum: 1, maximum: 1000 }, readType: { type: "string", description: "Type of reads to generate", enum: ["single-end", "paired-end"] }, insertSize: { type: "number", description: "Mean insert size for paired-end reads (ignored for single-end)", minimum: 100, maximum: 1000 }, insertSizeStd: { type: "number", description: "Standard deviation of insert size for paired-end reads", minimum: 10, maximum: 200 }, errorRate: { type: "number", description: "Base calling error rate (0-1)", minimum: 0, maximum: 0.1 }, qualityModel: { type: "string", description: "Quality score model to use", enum: ["illumina", "454", "ion-torrent", "pacbio"] }, mutationRate: { type: "number", description: "Rate of true mutations to introduce (0-1)", minimum: 0, maximum: 0.05 }, seed: { type: "number", description: "Random seed for reproducible results (optional)" }, outputFormat: { type: "string", description: "Output format for the reads", enum: ["fastq", "json"] } }, required: ["referenceSequence", "readLength", "coverage"] }, },
- src/server.ts:107-120 (registration)Registration of simulate_fastq_file handler in the MCP server's CallToolRequestSchema switch statement. Routes requests to the simulateFastq.handler with type-validated arguments.case "simulate_fastq_file": return await simulateFastq.handler(args as { referenceSequence: string; readLength: number; coverage: number; readType?: string; insertSize?: number; insertSizeStd?: number; errorRate?: number; qualityModel?: string; mutationRate?: number; seed?: number; outputFormat?: string; });
- src/utils/sequenceUtils.ts:48-176 (helper)SequenceGenerator helper class providing random number generation with optional seed. Used by simulate_fastq_file for deterministic and non-deterministic simulations via the rng() method.export class SequenceGenerator { public rng: () => number; constructor(seed?: number) { if (seed !== undefined) { let s = seed; this.rng = () => { s = Math.sin(s) * 10000; return s - Math.floor(s); }; } else { this.rng = Math.random; } } generateRandomDNA(length: number, gcContent: number = 0.5): string { const gcProb = gcContent / 2; const atProb = (1 - gcContent) / 2; let sequence = ''; for (let i = 0; i < length; i++) { const rand = this.rng(); if (rand < atProb) { sequence += 'A'; } else if (rand < atProb * 2) { sequence += 'T'; } else if (rand < atProb * 2 + gcProb) { sequence += 'G'; } else { sequence += 'C'; } } return sequence; } generateRandomProtein(length: number): string { let sequence = ''; for (let i = 0; i < length; i++) { const index = Math.floor(this.rng() * AMINO_ACIDS.length); sequence += AMINO_ACIDS[index]; } return sequence; } mutateDNA(sequence: string, params: MutationParameters): string { const { substitutionRate = 0.01, insertionRate = 0.001, deletionRate = 0.001, transitionBias = 2.0 } = params; let mutated = sequence.split(''); for (let i = 0; i < mutated.length; i++) { if (this.rng() < substitutionRate) { const currentBase = mutated[i]; let newBase: string; if (this.rng() < transitionBias / (transitionBias + 1)) { newBase = this.getTransition(currentBase); } else { newBase = this.getTransversion(currentBase); } mutated[i] = newBase; } if (this.rng() < insertionRate) { const randomBase = DNA_BASES[Math.floor(this.rng() * DNA_BASES.length)]; mutated.splice(i, 0, randomBase); i++; } if (this.rng() < deletionRate && mutated.length > 1) { mutated.splice(i, 1); i--; } } return mutated.join(''); } private getTransition(base: string): string { switch (base) { case 'A': return 'G'; case 'G': return 'A'; case 'T': return 'C'; case 'C': return 'T'; default: return base; } } private getTransversion(base: string): string { const transversions: Record<string, string[]> = { 'A': ['T', 'C'], 'T': ['A', 'G'], 'G': ['T', 'C'], 'C': ['A', 'G'] }; const options = transversions[base] || [base]; return options[Math.floor(this.rng() * options.length)]; } evolveSequence(sequence: string, params: EvolutionParameters): string[] { let population = Array(params.populationSize).fill(sequence); const generations = [sequence]; for (let gen = 0; gen < params.generations; gen++) { population = population.map(seq => this.mutateDNA(seq, { substitutionRate: params.mutationRate }) ); if (params.selectionPressure) { population.sort((a, b) => this.calculateFitness(b) - this.calculateFitness(a)); population = population.slice(0, Math.floor(params.populationSize / 2)); population = population.concat(population); } generations.push(population[0]); } return generations; } private calculateFitness(sequence: string): number { const gcContent = (sequence.match(/[GC]/g) || []).length / sequence.length; return Math.abs(gcContent - 0.5); } }
- src/tools/simulateFastq.ts:175-375 (helper)Supporting helper functions for FASTQ simulation: generateSingleEndRead, generatePairedEndReads, applyMutations, applySequencingErrors, generateQualityScore, reverseComplement, and formatFastqRecord. These implement the core simulation logic including read generation, mutation application, error modeling, quality score generation, and output formatting.function generateSingleEndRead( reference: string, readLength: number, errorRate: number, mutationRate: number, qualityModel: string, generator: SequenceGenerator, readId: number ): any { // Random start position const maxStart = Math.max(0, reference.length - readLength); const startPos = Math.floor(generator.rng() * (maxStart + 1)); const endPos = Math.min(startPos + readLength, reference.length); let sequence = reference.substring(startPos, endPos); // Apply mutations first (true biological variants) sequence = applyMutations(sequence, mutationRate, generator); // Apply sequencing errors const { errorSequence, qualities } = applySequencingErrors( sequence, errorRate, qualityModel, generator ); return { id: `sim_read_${readId}`, sequence: errorSequence, qualities, startPos, endPos: startPos + errorSequence.length, strand: "+", mutations: sequence !== reference.substring(startPos, endPos), errors: errorSequence !== sequence }; } function generatePairedEndReads( reference: string, readLength: number, insertSize: number, insertSizeStd: number, errorRate: number, mutationRate: number, qualityModel: string, generator: SequenceGenerator, readId: number ): any { // Generate insert size with normal distribution approximation const actualInsertSize = Math.max( readLength * 2, Math.round(insertSize + (generator.rng() - 0.5) * 2 * insertSizeStd) ); const maxStart = Math.max(0, reference.length - actualInsertSize); const fragmentStart = Math.floor(generator.rng() * (maxStart + 1)); const fragmentEnd = Math.min(fragmentStart + actualInsertSize, reference.length); // Read 1 (forward) const read1Start = fragmentStart; const read1End = Math.min(read1Start + readLength, reference.length); let read1Seq = reference.substring(read1Start, read1End); // Read 2 (reverse) const read2End = fragmentEnd; const read2Start = Math.max(read2End - readLength, 0); let read2Seq = reverseComplement(reference.substring(read2Start, read2End)); // Apply mutations and errors to both reads read1Seq = applyMutations(read1Seq, mutationRate, generator); read2Seq = applyMutations(read2Seq, mutationRate, generator); const read1Result = applySequencingErrors(read1Seq, errorRate, qualityModel, generator); const read2Result = applySequencingErrors(read2Seq, errorRate, qualityModel, generator); return { read1: { id: `sim_read_${readId}/1`, sequence: read1Result.errorSequence, qualities: read1Result.qualities, startPos: read1Start, endPos: read1End, strand: "+", insertSize: actualInsertSize }, read2: { id: `sim_read_${readId}/2`, sequence: read2Result.errorSequence, qualities: read2Result.qualities, startPos: read2Start, endPos: read2End, strand: "-", insertSize: actualInsertSize } }; } function applyMutations(sequence: string, mutationRate: number, generator: SequenceGenerator): string { const bases = ['A', 'T', 'G', 'C']; let mutated = sequence.split(''); for (let i = 0; i < mutated.length; i++) { if (generator.rng() < mutationRate) { const currentBase = mutated[i]; const otherBases = bases.filter(b => b !== currentBase); mutated[i] = otherBases[Math.floor(generator.rng() * otherBases.length)]; } } return mutated.join(''); } function applySequencingErrors( sequence: string, errorRate: number, qualityModel: string, generator: SequenceGenerator ): { errorSequence: string; qualities: string } { const bases = ['A', 'T', 'G', 'C']; let errorSequence = sequence.split(''); let qualities = ''; for (let i = 0; i < sequence.length; i++) { let hasError = generator.rng() < errorRate; let quality: number; if (hasError) { // Apply error const currentBase = errorSequence[i]; const otherBases = bases.filter(b => b !== currentBase); errorSequence[i] = otherBases[Math.floor(generator.rng() * otherBases.length)]; // Lower quality for error positions quality = generateQualityScore(qualityModel, generator, true); } else { // No error, higher quality quality = generateQualityScore(qualityModel, generator, false); } qualities += String.fromCharCode(33 + quality); } return { errorSequence: errorSequence.join(''), qualities }; } function generateQualityScore(qualityModel: string, generator: SequenceGenerator, hasError: boolean): number { let quality: number; switch (qualityModel) { case "illumina": if (hasError) { quality = Math.floor(generator.rng() * 20) + 2; // Q2-Q22 } else { quality = Math.floor(generator.rng() * 20) + 20; // Q20-Q40 } break; case "454": if (hasError) { quality = Math.floor(generator.rng() * 15) + 2; // Q2-Q17 } else { quality = Math.floor(generator.rng() * 20) + 15; // Q15-Q35 } break; case "ion-torrent": if (hasError) { quality = Math.floor(generator.rng() * 10) + 2; // Q2-Q12 } else { quality = Math.floor(generator.rng() * 15) + 10; // Q10-Q25 } break; case "pacbio": if (hasError) { quality = Math.floor(generator.rng() * 8) + 2; // Q2-Q10 } else { quality = Math.floor(generator.rng() * 10) + 8; // Q8-Q18 } break; default: quality = hasError ? 10 : 30; } return Math.min(93, Math.max(0, quality)); // Clamp to valid FASTQ range } function reverseComplement(sequence: string): string { const complement: Record<string, string> = { 'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G' }; return sequence .split('') .reverse() .map(base => complement[base] || base) .join(''); } function formatFastqRecord(read: any): string { return `@${read.id}\n${read.sequence}\n+\n${read.qualities}`; }