/**
* Lexer for the Probabilistic Domain-Specific Language (PDSL)
*
* Tokenizes PDSL source code into a stream of tokens.
* Handles keywords, operators, identifiers, numbers, strings, and comments.
*/
// ============================================================================
// Token Types
// ============================================================================
/**
 * All token categories produced by the PDSL lexer.
 *
 * String-valued enum so tokens stay human-readable in debug output and
 * parser error messages.
 */
export enum TokenType {
  // Keywords (matched against lowercase spellings; see Lexer.getKeywordType)
  PROBABILISTIC_MODEL = 'PROBABILISTIC_MODEL',
  OBSERVE = 'OBSERVE',
  QUERY = 'QUERY',
  LEARN = 'LEARN',
  PARAMETERS = 'PARAMETERS',
  FROM = 'FROM',
  DATASET = 'DATASET',
  NOT = 'NOT',
  TRUE = 'TRUE',
  FALSE = 'FALSE',
  // Identifiers and literals
  VARIABLE = 'VARIABLE', // X, Y, Person (uppercase start)
  CONSTANT = 'CONSTANT', // alice, flu (lowercase start)
  PROBABILITY = 'PROBABILITY', // 0.7, 0.95 (any numeric literal in [0, 1])
  NUMBER = 'NUMBER', // 42, 3.14
  STRING = 'STRING', // "file.csv"
  // Operators
  PROB_ANNOTATION = 'PROB_ANNOTATION', // ::
  IMPLICATION = 'IMPLICATION', // :-
  COMMA = 'COMMA', // ,
  SEMICOLON = 'SEMICOLON', // ;
  // Delimiters
  LPAREN = 'LPAREN', // (
  RPAREN = 'RPAREN', // )
  LBRACE = 'LBRACE', // {
  RBRACE = 'RBRACE', // }
  LBRACKET = 'LBRACKET', // [
  RBRACKET = 'RBRACKET', // ]
  // Special
  COMMENT = 'COMMENT', // # comment (scanned but filtered out by Lexer.tokenize)
  // NOTE(review): NEWLINE is declared but never emitted — skipWhitespace
  // consumes '\n' as whitespace. Confirm whether the parser expects it.
  NEWLINE = 'NEWLINE', // \n
  EOF = 'EOF', // End of file
  UNKNOWN = 'UNKNOWN' // Error token
}
/**
 * A single lexical token with its source location.
 */
export interface Token {
  /** Token category. */
  type: TokenType;
  /** Lexeme text. For STRING tokens this excludes the surrounding quotes. */
  value: string;
  /** 1-based line on which the token starts. */
  line: number;
  /** 1-based column at which the token starts. */
  column: number;
  /** Source span in characters. For STRING tokens this includes the quotes. */
  length: number;
}
// ============================================================================
// Lexer Class
// ============================================================================
export class Lexer {
  /** Raw PDSL source text being scanned. */
  private source: string;
  /** Index into `source` of the next unread character. */
  private position: number = 0;
  /** 1-based line of the next unread character. */
  private line: number = 1;
  /** 1-based column of the next unread character. */
  private column: number = 1;
  /** Accumulated output token stream. */
  private tokens: Token[] = [];

  /**
   * Keyword lookup table, built once per class instead of once per
   * identifier scanned.
   *
   * A Map is used deliberately instead of a plain object literal: a
   * `keywords[value]` lookup on an object also finds properties inherited
   * from Object.prototype, so identifiers such as `toString` or
   * `constructor` were previously misclassified as keywords (the lookup
   * returned a truthy function, not a TokenType).
   */
  private static readonly KEYWORDS: ReadonlyMap<string, TokenType> = new Map([
    ['probabilistic_model', TokenType.PROBABILISTIC_MODEL],
    ['observe', TokenType.OBSERVE],
    ['query', TokenType.QUERY],
    ['learn', TokenType.LEARN],
    ['parameters', TokenType.PARAMETERS],
    ['from', TokenType.FROM],
    ['dataset', TokenType.DATASET],
    ['not', TokenType.NOT],
    ['true', TokenType.TRUE],
    ['false', TokenType.FALSE],
  ]);

  constructor(source: string) {
    this.source = source;
  }

  /**
   * Tokenize the entire source code.
   *
   * Comments are scanned but dropped from the output. Newlines are
   * consumed as whitespace (no NEWLINE tokens are emitted). A trailing
   * EOF token is always appended.
   *
   * @returns The full token stream, ending with an EOF token.
   * @throws LexerError on an unterminated string literal.
   */
  public tokenize(): Token[] {
    while (!this.isAtEnd()) {
      this.skipWhitespace();
      if (this.isAtEnd()) break;
      const token = this.scanToken();
      // Comments are recognized (to advance past them) but not emitted.
      if (token.type !== TokenType.COMMENT) {
        this.tokens.push(token);
      }
    }
    this.tokens.push(this.createToken(TokenType.EOF, ''));
    return this.tokens;
  }

  /**
   * Scan a single token starting at the current position.
   *
   * Precondition: not at end of input and not positioned on whitespace
   * (tokenize() guarantees both). Unrecognized characters are consumed
   * and returned as UNKNOWN tokens rather than thrown, so the caller can
   * report them with position information.
   */
  private scanToken(): Token {
    const char = this.peek();
    // Comments
    if (char === '#') {
      return this.scanComment();
    }
    // Two-character operators — check before single-char handling since
    // both start with ':'.
    if (char === ':' && this.peekNext() === ':') {
      return this.scanTwoChar(TokenType.PROB_ANNOTATION, '::');
    }
    if (char === ':' && this.peekNext() === '-') {
      return this.scanTwoChar(TokenType.IMPLICATION, ':-');
    }
    // Numbers (including probabilities)
    if (this.isDigit(char)) {
      return this.scanNumber();
    }
    // Strings
    if (char === '"') {
      return this.scanString();
    }
    // Keywords, identifiers, variables
    if (this.isAlpha(char) || char === '_') {
      return this.scanIdentifier();
    }
    // Single-character tokens
    switch (char) {
      case '(': return this.scanSingleChar(TokenType.LPAREN);
      case ')': return this.scanSingleChar(TokenType.RPAREN);
      case '{': return this.scanSingleChar(TokenType.LBRACE);
      case '}': return this.scanSingleChar(TokenType.RBRACE);
      case '[': return this.scanSingleChar(TokenType.LBRACKET);
      case ']': return this.scanSingleChar(TokenType.RBRACKET);
      case ',': return this.scanSingleChar(TokenType.COMMA);
      case ';': return this.scanSingleChar(TokenType.SEMICOLON);
      default:
        this.advance();
        return this.createToken(TokenType.UNKNOWN, char);
    }
  }

  /**
   * Scan a comment: everything from '#' up to (but not including) the
   * end of line. The trailing newline is left for skipWhitespace.
   */
  private scanComment(): Token {
    const start = this.position;
    const startColumn = this.column;
    this.advance(); // Skip #
    while (!this.isAtEnd() && this.peek() !== '\n') {
      this.advance();
    }
    const value = this.source.substring(start, this.position);
    return {
      type: TokenType.COMMENT,
      value,
      line: this.line,
      column: startColumn,
      length: value.length
    };
  }

  /**
   * Scan a number (integer or float).
   *
   * A '.' is only consumed when followed by a digit, so "3." lexes as
   * NUMBER "3" plus an UNKNOWN '.'. Any numeric value in [0, 1] —
   * including the integers 0 and 1 — is classified as PROBABILITY;
   * everything else is NUMBER.
   */
  private scanNumber(): Token {
    const start = this.position;
    const startColumn = this.column;
    while (this.isDigit(this.peek())) {
      this.advance();
    }
    // Decimal point
    if (this.peek() === '.' && this.isDigit(this.peekNext())) {
      this.advance(); // consume '.'
      while (this.isDigit(this.peek())) {
        this.advance();
      }
    }
    const value = this.source.substring(start, this.position);
    const numValue = parseFloat(value);
    // Determine if it's a probability or general number
    const type = (numValue >= 0 && numValue <= 1.0)
      ? TokenType.PROBABILITY
      : TokenType.NUMBER;
    return {
      type,
      value,
      line: this.line,
      column: startColumn,
      length: value.length
    };
  }

  /**
   * Scan a string literal delimited by double quotes.
   *
   * Escape sequences are not supported: any '"' terminates the string,
   * and a backslash is kept literally. Strings may span multiple lines;
   * line/column tracking is updated for embedded newlines. The token's
   * `value` excludes the quotes while `length` includes them, and the
   * reported position is that of the opening quote.
   *
   * @throws LexerError if the closing quote is never found.
   */
  private scanString(): Token {
    const start = this.position;
    const startColumn = this.column;
    const startLine = this.line;
    this.advance(); // opening "
    while (!this.isAtEnd() && this.peek() !== '"') {
      if (this.peek() === '\n') {
        this.line++;
        // 0 so the advance() below leaves the column at 1 for the first
        // character of the new line.
        this.column = 0;
      }
      this.advance();
    }
    if (this.isAtEnd()) {
      throw new LexerError('Unterminated string', startLine, startColumn);
    }
    this.advance(); // closing "
    const value = this.source.substring(start + 1, this.position - 1);
    return {
      type: TokenType.STRING,
      value,
      line: startLine,
      column: startColumn,
      length: this.position - start
    };
  }

  /**
   * Scan an identifier and classify it as a keyword, a VARIABLE
   * (uppercase first letter), or a CONSTANT (anything else, including
   * a leading underscore or lowercase letter).
   */
  private scanIdentifier(): Token {
    const start = this.position;
    const startColumn = this.column;
    while (this.isAlphaNumeric(this.peek()) || this.peek() === '_') {
      this.advance();
    }
    const value = this.source.substring(start, this.position);
    // Check for keywords
    const keywordType = this.getKeywordType(value);
    if (keywordType !== null) {
      return {
        type: keywordType,
        value,
        line: this.line,
        column: startColumn,
        length: value.length
      };
    }
    // Determine if variable or constant based on first character
    const type = this.isUpperCase(value.charAt(0)) ? TokenType.VARIABLE : TokenType.CONSTANT;
    return {
      type,
      value,
      line: this.line,
      column: startColumn,
      length: value.length
    };
  }

  /**
   * Map an identifier to its keyword token type, or null for a plain
   * identifier. Backed by the class-level KEYWORDS Map so that names
   * inherited from Object.prototype (e.g. 'toString') never match.
   */
  private getKeywordType(value: string): TokenType | null {
    return Lexer.KEYWORDS.get(value) ?? null;
  }

  /**
   * Consume one character and wrap it in a token of the given type.
   */
  private scanSingleChar(type: TokenType): Token {
    const char = this.advance();
    return this.createToken(type, char);
  }

  /**
   * Consume two characters and wrap them in a token of the given type.
   * Precondition (guaranteed by scanToken): both characters are present.
   */
  private scanTwoChar(type: TokenType, value: string): Token {
    const startColumn = this.column;
    this.advance();
    this.advance();
    return {
      type,
      value,
      line: this.line,
      column: startColumn,
      length: 2
    };
  }

  /**
   * Build a token whose text was just consumed: its start column is
   * back-computed as the current column minus the value's length.
   */
  private createToken(type: TokenType, value: string): Token {
    return {
      type,
      value,
      line: this.line,
      column: this.column - value.length,
      length: value.length
    };
  }

  // ==========================================================================
  // Character Classification
  // ==========================================================================
  /** ASCII letter? (Non-ASCII identifiers are not supported.) */
  private isAlpha(char: string): boolean {
    return /[a-zA-Z]/.test(char);
  }

  /** ASCII decimal digit? */
  private isDigit(char: string): boolean {
    return /[0-9]/.test(char);
  }

  /** ASCII letter or digit? */
  private isAlphaNumeric(char: string): boolean {
    return this.isAlpha(char) || this.isDigit(char);
  }

  /** ASCII uppercase letter? Used to distinguish VARIABLE from CONSTANT. */
  private isUpperCase(char: string): boolean {
    return /[A-Z]/.test(char);
  }

  // ==========================================================================
  // Position Management
  // ==========================================================================
  /** Current character without consuming it; '\0' sentinel at end. */
  private peek(): string {
    if (this.isAtEnd()) return '\0';
    return this.source[this.position];
  }

  /** Character after the current one; '\0' sentinel past the end. */
  private peekNext(): string {
    if (this.position + 1 >= this.source.length) return '\0';
    return this.source[this.position + 1];
  }

  /** Consume and return the current character, advancing the column. */
  private advance(): string {
    const char = this.source[this.position++];
    this.column++;
    return char;
  }

  /** True once every character of the source has been consumed. */
  private isAtEnd(): boolean {
    return this.position >= this.source.length;
  }

  /**
   * Skip whitespace (spaces, tabs, carriage returns, newlines), keeping
   * the line and column counters in sync.
   */
  private skipWhitespace(): void {
    while (!this.isAtEnd()) {
      const char = this.peek();
      if (char === ' ' || char === '\t' || char === '\r') {
        this.advance();
      } else if (char === '\n') {
        this.line++;
        // Set to 0 so the advance() below leaves the column at 1, the
        // position of the first character of the new line. (This was
        // previously set to 1, so every token after a newline reported a
        // column one too large; scanString already used the correct
        // 0-then-advance convention.)
        this.column = 0;
        this.advance();
      } else {
        break;
      }
    }
  }
}
// ============================================================================
// Error Classes
// ============================================================================
/**
 * Error thrown when the lexer encounters invalid input (e.g. an
 * unterminated string). Carries the 1-based source position at which
 * the problem was detected.
 */
export class LexerError extends Error {
  /** 1-based line of the offending input. */
  public line: number;
  /** 1-based column of the offending input. */
  public column: number;

  constructor(message: string, line: number, column: number) {
    super(`Lexer error at line ${line}, column ${column}: ${message}`);
    this.name = 'LexerError';
    this.line = line;
    this.column = column;
  }
}
// ============================================================================
// Utility Functions
// ============================================================================
/**
* Tokenize PDSL source code
*/
export function tokenize(source: string): Token[] {
const lexer = new Lexer(source);
return lexer.tokenize();
}