// Placeholder tokenize implementation for tests; to be refined once tests are added.
const STOP_WORDS_EN = new Set(['the','a','an','of','and','or']);
const STOP_WORDS_ZH = new Set(['的','了','和']);
export function normalize(str){
  if(!str) return '';
  // Map the full-width ASCII letter/digit subset to half-width, then lowercase.
  // (U+FF10-FF19 digits, U+FF21-FF3A uppercase, U+FF41-FF5A lowercase; half-width offset is 0xFEE0.)
  return str
    .replace(/[\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A]/g, ch => String.fromCharCode(ch.charCodeAt(0) - 0xFEE0))
    .toLowerCase();
}
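// Illustrative (not executed): with the full-width mapping above,
//   normalize('ＡＢＣ１２３ Foo') === 'abc123 foo'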
export function tokenize(text){
  const norm = normalize(text);
  // Split on any run of non-letter/non-digit characters (Unicode-aware), then drop stop words.
  // Note: CJK text is not segmented per character, so Chinese stop words only match standalone tokens.
  const raw = norm.split(/[^\p{L}\p{N}]+/u).filter(Boolean);
  return raw.filter(tok => !(STOP_WORDS_EN.has(tok) || STOP_WORDS_ZH.has(tok)));
}
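// Illustrative (not executed): English stop words are removed after normalization, e.g.
//   tokenize('The Cat and the Dog') → ['cat', 'dog']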
export function tokenizeDistinct(text){
  return Array.from(new Set(tokenize(text)));
}
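// Illustrative (not executed): duplicates collapse, and "的" is dropped only when it appears as an isolated token, e.g.
//   tokenizeDistinct('cat CAT 的 dog') → ['cat', 'dog']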