/**
* Lexer for the Constraint Specification DSL
*
* Tokenizes CSDL source code into a stream of tokens for parsing.
* Supports all CSDL operators, keywords, literals, and identifiers.
*/
// ============================================================================
// Token Types
// ============================================================================
export enum TokenType {
// Keywords
VAR = 'VAR',
CONST = 'CONST',
FUNCTION = 'FUNCTION',
CONSTRAINT_TYPE = 'CONSTRAINT_TYPE',
IF = 'IF',
THEN = 'THEN',
ELSE = 'ELSE',
FORALL = 'FORALL',
EXISTS = 'EXISTS',
IN = 'IN',
ASSERT = 'ASSERT',
SOFT = 'SOFT',
CHECK_SAT = 'CHECK_SAT',
MINIMIZE = 'MINIMIZE',
MAXIMIZE = 'MAXIMIZE',
WEIGHT = 'WEIGHT',
WHERE = 'WHERE',
FOR = 'FOR',
LAMBDA = 'LAMBDA',
MATCH = 'MATCH',
CASE = 'CASE',
DATATYPE = 'DATATYPE',
// Type keywords
INT = 'INT',
REAL = 'REAL',
BOOL = 'BOOL',
STRING = 'STRING',
BITVEC = 'BITVEC',
ARRAY = 'ARRAY',
TUPLE = 'TUPLE',
SET = 'SET',
MAP = 'MAP',
// Literals
NUMBER = 'NUMBER',
STRING_LITERAL = 'STRING_LITERAL',
TRUE = 'TRUE',
FALSE = 'FALSE',
// Identifiers
IDENTIFIER = 'IDENTIFIER',
// Arithmetic operators
PLUS = 'PLUS', // +
MINUS = 'MINUS', // -
STAR = 'STAR', // *
SLASH = 'SLASH', // /
DIV = 'DIV', // div
PERCENT = 'PERCENT', // %
POWER = 'POWER', // **
// Comparison operators
EQ = 'EQ', // ==
NE = 'NE', // !=
LT = 'LT', // <
LE = 'LE', // <=
GT = 'GT', // >
GE = 'GE', // >=
// Logical operators
AND = 'AND', // and, &&
OR = 'OR', // or, ||
NOT = 'NOT', // not, !
IMPLIES = 'IMPLIES', // =>, implies
IFF = 'IFF', // <=>, iff
XOR = 'XOR', // xor
// Bitwise operators
BIT_AND = 'BIT_AND', // &
BIT_OR = 'BIT_OR', // |
BIT_XOR = 'BIT_XOR', // ^
BIT_NOT = 'BIT_NOT', // ~
LSHIFT = 'LSHIFT', // <<
RSHIFT = 'RSHIFT', // >>
ARSHIFT = 'ARSHIFT', // >>>
// Delimiters
LPAREN = 'LPAREN', // (
RPAREN = 'RPAREN', // )
LBRACKET = 'LBRACKET', // [
RBRACKET = 'RBRACKET', // ]
LBRACE = 'LBRACE', // {
RBRACE = 'RBRACE', // }
// Punctuation
COMMA = 'COMMA', // ,
COLON = 'COLON', // :
SEMICOLON = 'SEMICOLON', // ;
DOT = 'DOT', // .
DOTDOT = 'DOTDOT', // ..
DOTDOTLT = 'DOTDOTLT', // ..<
ARROW = 'ARROW', // ->
ASSIGN = 'ASSIGN', // =
PIPE = 'PIPE', // | (for pattern matching)
// Special keywords
CONTAINS = 'CONTAINS',
STARTS_WITH = 'STARTS_WITH',
ENDS_WITH = 'ENDS_WITH',
MATCHES = 'MATCHES',
UNION = 'UNION',
INTERSECT = 'INTERSECT',
DIFF = 'DIFF',
// Special
COMMENT = 'COMMENT',
NEWLINE = 'NEWLINE',
EOF = 'EOF',
}
// ============================================================================
// Token Structure
// ============================================================================
export interface Token {
type: TokenType;
value: string;
line: number;
column: number;
length: number;
}
// ============================================================================
// Lexer Class
// ============================================================================
export class Lexer {
private source: string;
private position: number = 0;
private line: number = 1;
private column: number = 1;
private tokens: Token[] = [];
constructor(source: string) {
this.source = source;
}
/**
* Tokenize the entire source code
*/
public tokenize(): Token[] {
this.tokens = [];
this.position = 0;
this.line = 1;
this.column = 1;
while (!this.isAtEnd()) {
const token = this.nextToken();
if (token.type !== TokenType.COMMENT) {
this.tokens.push(token);
}
}
this.tokens.push(this.makeToken(TokenType.EOF, '', 0));
return this.tokens;
}
/**
* Get the next token from the source
*/
public nextToken(): Token {
this.skipWhitespace();
if (this.isAtEnd()) {
return this.makeToken(TokenType.EOF, '', 0);
}
const startColumn = this.column;
const char = this.peek();
// Comments
if (char === '#') {
return this.scanComment();
}
// String literals
if (char === '"' || char === "'") {
return this.scanString();
}
// Numbers
if (this.isDigit(char)) {
return this.scanNumber();
}
// Identifiers and keywords
if (this.isAlpha(char) || char === '_') {
return this.scanIdentifier();
}
// Operators and punctuation
return this.scanOperator();
}
// ==========================================================================
// Scanning Methods
// ==========================================================================
private scanComment(): Token {
const start = this.position;
const startColumn = this.column;
this.advance(); // Skip #
while (!this.isAtEnd() && this.peek() !== '\n') {
this.advance();
}
const value = this.source.substring(start, this.position);
return this.makeToken(TokenType.COMMENT, value, this.position - start, startColumn);
}
private scanString(): Token {
const startColumn = this.column;
const quote = this.advance();
const start = this.position;
const chars: string[] = [];
while (!this.isAtEnd() && this.peek() !== quote) {
if (this.peek() === '\\') {
this.advance(); // Skip backslash
if (!this.isAtEnd()) {
const escaped = this.advance();
// Handle escape sequences
switch (escaped) {
case 'n': chars.push('\n'); break;
case 't': chars.push('\t'); break;
case 'r': chars.push('\r'); break;
case '\\': chars.push('\\'); break;
case '"': chars.push('"'); break;
case "'": chars.push("'"); break;
default: chars.push(escaped);
}
}
} else {
chars.push(this.advance());
}
}
if (this.isAtEnd()) {
throw new Error(`Unterminated string at line ${this.line}, column ${startColumn}`);
}
this.advance(); // Closing quote
const value = chars.join('');
const length = this.position - start + 2; // Include quotes
return this.makeToken(TokenType.STRING_LITERAL, value, length, startColumn);
}
private scanNumber(): Token {
const start = this.position;
const startColumn = this.column;
// Integer part
while (this.isDigit(this.peek())) {
this.advance();
}
// Decimal part
if (this.peek() === '.' && this.isDigit(this.peekNext())) {
this.advance(); // Consume '.'
while (this.isDigit(this.peek())) {
this.advance();
}
}
// Scientific notation
if (this.peek() === 'e' || this.peek() === 'E') {
this.advance();
if (this.peek() === '+' || this.peek() === '-') {
this.advance();
}
if (!this.isDigit(this.peek())) {
throw new Error(`Invalid number format at line ${this.line}, column ${this.column}`);
}
while (this.isDigit(this.peek())) {
this.advance();
}
}
const value = this.source.substring(start, this.position);
return this.makeToken(TokenType.NUMBER, value, this.position - start, startColumn);
}
private scanIdentifier(): Token {
const start = this.position;
const startColumn = this.column;
while (this.isAlphaNumeric(this.peek())) {
this.advance();
}
const value = this.source.substring(start, this.position);
const type = this.getKeywordType(value);
return this.makeToken(type, value, this.position - start, startColumn);
}
private scanOperator(): Token {
const startColumn = this.column;
const char = this.advance();
switch (char) {
case '+':
return this.makeToken(TokenType.PLUS, '+', 1, startColumn);
case '-':
if (this.match('>')) {
return this.makeToken(TokenType.ARROW, '->', 2, startColumn);
}
return this.makeToken(TokenType.MINUS, '-', 1, startColumn);
case '*':
if (this.match('*')) {
return this.makeToken(TokenType.POWER, '**', 2, startColumn);
}
return this.makeToken(TokenType.STAR, '*', 1, startColumn);
case '/':
return this.makeToken(TokenType.SLASH, '/', 1, startColumn);
case '%':
return this.makeToken(TokenType.PERCENT, '%', 1, startColumn);
case '=':
if (this.match('=')) {
return this.makeToken(TokenType.EQ, '==', 2, startColumn);
}
if (this.match('>')) {
return this.makeToken(TokenType.IMPLIES, '=>', 2, startColumn);
}
return this.makeToken(TokenType.ASSIGN, '=', 1, startColumn);
case '!':
if (this.match('=')) {
return this.makeToken(TokenType.NE, '!=', 2, startColumn);
}
return this.makeToken(TokenType.NOT, '!', 1, startColumn);
case '<':
if (this.match('=')) {
if (this.match('>')) {
return this.makeToken(TokenType.IFF, '<=>', 3, startColumn);
}
return this.makeToken(TokenType.LE, '<=', 2, startColumn);
}
if (this.match('<')) {
return this.makeToken(TokenType.LSHIFT, '<<', 2, startColumn);
}
return this.makeToken(TokenType.LT, '<', 1, startColumn);
case '>':
if (this.match('=')) {
return this.makeToken(TokenType.GE, '>=', 2, startColumn);
}
if (this.match('>')) {
if (this.match('>')) {
return this.makeToken(TokenType.ARSHIFT, '>>>', 3, startColumn);
}
return this.makeToken(TokenType.RSHIFT, '>>', 2, startColumn);
}
return this.makeToken(TokenType.GT, '>', 1, startColumn);
case '&':
if (this.match('&')) {
return this.makeToken(TokenType.AND, '&&', 2, startColumn);
}
return this.makeToken(TokenType.BIT_AND, '&', 1, startColumn);
case '|':
if (this.match('|')) {
return this.makeToken(TokenType.OR, '||', 2, startColumn);
}
return this.makeToken(TokenType.PIPE, '|', 1, startColumn);
case '^':
return this.makeToken(TokenType.BIT_XOR, '^', 1, startColumn);
case '~':
return this.makeToken(TokenType.BIT_NOT, '~', 1, startColumn);
case '(':
return this.makeToken(TokenType.LPAREN, '(', 1, startColumn);
case ')':
return this.makeToken(TokenType.RPAREN, ')', 1, startColumn);
case '[':
return this.makeToken(TokenType.LBRACKET, '[', 1, startColumn);
case ']':
return this.makeToken(TokenType.RBRACKET, ']', 1, startColumn);
case '{':
return this.makeToken(TokenType.LBRACE, '{', 1, startColumn);
case '}':
return this.makeToken(TokenType.RBRACE, '}', 1, startColumn);
case ',':
return this.makeToken(TokenType.COMMA, ',', 1, startColumn);
case ':':
return this.makeToken(TokenType.COLON, ':', 1, startColumn);
case ';':
return this.makeToken(TokenType.SEMICOLON, ';', 1, startColumn);
case '.':
if (this.match('.')) {
if (this.match('<')) {
return this.makeToken(TokenType.DOTDOTLT, '..<', 3, startColumn);
}
return this.makeToken(TokenType.DOTDOT, '..', 2, startColumn);
}
return this.makeToken(TokenType.DOT, '.', 1, startColumn);
default:
throw new Error(
`Unexpected character '${char}' at line ${this.line}, column ${startColumn}`
);
}
}
// ==========================================================================
// Helper Methods
// ==========================================================================
private getKeywordType(value: string): TokenType {
const keywords: { [key: string]: TokenType } = {
'var': TokenType.VAR,
'const': TokenType.CONST,
'function': TokenType.FUNCTION,
'constraint_type': TokenType.CONSTRAINT_TYPE,
'if': TokenType.IF,
'then': TokenType.THEN,
'else': TokenType.ELSE,
'forall': TokenType.FORALL,
'exists': TokenType.EXISTS,
'in': TokenType.IN,
'assert': TokenType.ASSERT,
'soft': TokenType.SOFT,
'check_sat': TokenType.CHECK_SAT,
'minimize': TokenType.MINIMIZE,
'maximize': TokenType.MAXIMIZE,
'weight': TokenType.WEIGHT,
'where': TokenType.WHERE,
'for': TokenType.FOR,
'lambda': TokenType.LAMBDA,
'match': TokenType.MATCH,
'case': TokenType.CASE,
'datatype': TokenType.DATATYPE,
// Types
'Int': TokenType.INT,
'Real': TokenType.REAL,
'Bool': TokenType.BOOL,
'String': TokenType.STRING,
'BitVec': TokenType.BITVEC,
'Array': TokenType.ARRAY,
'Tuple': TokenType.TUPLE,
'Set': TokenType.SET,
'Map': TokenType.MAP,
// Boolean literals
'true': TokenType.TRUE,
'false': TokenType.FALSE,
// Logical operators (word forms)
'and': TokenType.AND,
'or': TokenType.OR,
'not': TokenType.NOT,
'implies': TokenType.IMPLIES,
'iff': TokenType.IFF,
'xor': TokenType.XOR,
// Arithmetic operators (word forms)
'div': TokenType.DIV,
// Set/String operators
'contains': TokenType.CONTAINS,
'starts_with': TokenType.STARTS_WITH,
'ends_with': TokenType.ENDS_WITH,
'matches': TokenType.MATCHES,
'union': TokenType.UNION,
'intersect': TokenType.INTERSECT,
'diff': TokenType.DIFF,
};
return keywords[value] || TokenType.IDENTIFIER;
}
private skipWhitespace(): void {
while (!this.isAtEnd()) {
const char = this.peek();
if (char === ' ' || char === '\t' || char === '\r') {
this.advance();
} else if (char === '\n') {
this.line++;
this.column = 0;
this.advance();
} else {
break;
}
}
}
private advance(): string {
const char = this.source[this.position];
this.position++;
this.column++;
return char;
}
private match(expected: string): boolean {
if (this.isAtEnd() || this.peek() !== expected) {
return false;
}
this.advance();
return true;
}
private peek(): string {
if (this.isAtEnd()) return '\0';
return this.source[this.position];
}
private peekNext(): string {
if (this.position + 1 >= this.source.length) return '\0';
return this.source[this.position + 1];
}
private isAtEnd(): boolean {
return this.position >= this.source.length;
}
private isDigit(char: string): boolean {
return char >= '0' && char <= '9';
}
private isAlpha(char: string): boolean {
return (char >= 'a' && char <= 'z') ||
(char >= 'A' && char <= 'Z') ||
char === '_';
}
private isAlphaNumeric(char: string): boolean {
return this.isAlpha(char) || this.isDigit(char);
}
private makeToken(type: TokenType, value: string, length: number, column?: number): Token {
return {
type,
value,
line: this.line,
column: column || this.column - length,
length,
};
}
}
// ============================================================================
// Utility Functions
// ============================================================================
/**
* Tokenize CSDL source code
*/
export function tokenize(source: string): Token[] {
const lexer = new Lexer(source);
return lexer.tokenize();
}
/**
* Pretty-print tokens for debugging
*/
export function tokensToString(tokens: Token[]): string {
return tokens
.map(t => `${t.type.padEnd(20)} "${t.value}" (${t.line}:${t.column})`)
.join('\n');
}