// index-xnf.mjs (53.3 kB)
// created 2023-09-25T01:01:55.148Z
// compressed base64-encoded blob for include-ens data
// source: https://github.com/adraffy/ens-normalize.js/blob/main/src/make.js
// see: https://github.com/adraffy/ens-normalize.js#security
// SHA-256: 0565ed049b9cf1614bb9e11ba7d8ac6a6fb96c893253d890f7e2b2884b9ded32
// Base64 (unpadded) arithmetic-coded payload that init() decodes on first use.
const COMPRESSED = '';
// Codepoints that check_fenced() restricts in placement, keyed to the name used in error messages.
const FENCED = new Map([
  [0x2019, 'apostrophe'],
  [0x2044, 'fraction slash'],
  [0x30FB, 'middle dot'],
]);
// Longest run of non-spacing marks accepted by check_group().
const NSM_MAX = 4;
// Decode an arithmetic-coded byte array into an array of integer symbols.
// Layout read from `bytes` (header fields are big-endian u16):
//   symbol_count, then (symbol_count - 1) frequencies, then the byte length of
//   a side payload, the side payload itself, and finally the coded bit stream.
// Symbol 0 is the end marker. The three highest symbols are escapes whose real
// value is stored as 1, 2 or 3 bytes in the side payload; any other symbol x
// decodes to x - 1.
function decode_arithmetic(bytes) {
let pos = 0;
// read a big-endian unsigned 16-bit value and advance the cursor
function u16() { return (bytes[pos++] << 8) | bytes[pos++]; }
// decode the frequency table
let symbol_count = u16();
let total = 1;
let acc = [0, 1]; // first symbol has frequency 1
// acc[] holds cumulative frequencies: acc[i]..acc[i+1] is the slice owned by symbol i
for (let i = 1; i < symbol_count; i++) {
acc.push(total += u16());
}
// skip the sized-payload that the last 3 symbols index into
let skip = u16();
let pos_payload = pos;
pos += skip;
let read_width = 0;
let read_buffer = 0;
// return the next bit of the coded stream, most significant bit of each byte first
function read_bit() {
if (read_width == 0) {
// this will read beyond end of buffer
// but (undefined|0) => zero pad
read_buffer = (read_buffer << 8) | bytes[pos++];
read_width = 8;
}
return (read_buffer >> --read_width) & 1;
}
// the decoder works with N-bit integers; HALF and QRTR are the renormalization thresholds
const N = 31;
const FULL = 2**N;
const HALF = FULL >>> 1;
const QRTR = HALF >> 1;
const MASK = FULL - 1;
// fill register
let register = 0;
for (let i = 0; i < N; i++) register = (register << 1) | read_bit();
let symbols = [];
let low = 0;
let range = FULL; // treat like a float
while (true) {
// scale the register into the cumulative-frequency domain
let value = Math.floor((((register - low + 1) * total) - 1) / range);
let start = 0;
let end = symbol_count;
while (end - start > 1) { // binary search
let mid = (start + end) >>> 1;
if (value < acc[mid]) {
end = mid;
} else {
start = mid;
}
}
if (start == 0) break; // first symbol is end mark
symbols.push(start);
// narrow [a, b] to the sub-interval owned by the decoded symbol
let a = low + Math.floor(range * acc[start] / total);
let b = low + Math.floor(range * acc[start+1] / total) - 1;
// while both bounds agree on the HALF bit, shift it out and pull in a fresh bit
while (((a ^ b) & HALF) == 0) {
register = (register << 1) & MASK | read_bit();
a = (a << 1) & MASK;
b = (b << 1) & MASK | 1;
}
// while a has the QRTR bit set and b does not, drop that bit (the HALF bit is kept)
while (a & ~b & QRTR) {
register = (register & HALF) | ((register << 1) & (MASK >>> 1)) | read_bit();
a = (a << 1) ^ HALF;
b = ((b ^ HALF) << 1) | HALF | 1;
}
low = a;
range = 1 + b - a;
}
let offset = symbol_count - 4;
return symbols.map(x => { // index into payload
switch (x - offset) {
// escapes: the value is stored out-of-line as 3, 2 or 1 payload byte(s)
case 3: return offset + 0x10100 + ((bytes[pos_payload++] << 16) | (bytes[pos_payload++] << 8) | bytes[pos_payload++]);
case 2: return offset + 0x100 + ((bytes[pos_payload++] << 8) | bytes[pos_payload++]);
case 1: return offset + bytes[pos_payload++];
default: return x - 1;
}
});
}
// wrap an array in a reader function: every call yields the next element
// (undefined once the array has been exhausted)
function read_payload(v) {
  let cursor = 0;
  return () => {
    const value = v[cursor];
    cursor += 1;
    return value;
  };
}
// Base64 string => bytes => arithmetic-decoded symbols => reader function
function read_compressed_payload(s) {
  const bytes = unsafe_atob(s);
  const symbols = decode_arithmetic(bytes);
  return read_payload(symbols);
}
// Minimal Base64 decoder returning a Uint8Array.
// "unsafe" because the input is expected to be well-formed Base64 w/o padding;
// no validation is performed.
// 20220922: added for https://github.com/adraffy/ens-normalize.js/issues/4
function unsafe_atob(s) {
  const ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
  // char code => 6-bit value
  const sextet_of = [];
  for (let k = 0; k < ALPHABET.length; k++) {
    sextet_of[ALPHABET.charCodeAt(k)] = k;
  }
  const n = s.length;
  const out = new Uint8Array((6 * n) >> 3); // trailing partial byte is dropped
  let write = 0; // next output index
  let bits = 0; // number of pending bits held in acc
  let acc = 0; // bit accumulator
  for (let i = 0; i < n; i++) {
    acc = (acc << 6) | sextet_of[s.charCodeAt(i)];
    bits += 6;
    if (bits >= 8) {
      bits -= 8;
      out[write++] = acc >> bits; // the typed array keeps only the low 8 bits
    }
  }
  return out;
}
// zig-zag decode: even inputs map to 0, 1, 2, ... and odd inputs to -1, -2, -3, ...
// eg. [0,1,2,3,...] => [0,-1,1,-2,...]
function signed(i) {
  const half = i >> 1;
  return (i & 1) ? ~half : half;
}
// read n zig-zag encoded deltas and return their running sums (starting from 0)
function read_deltas(n, next) {
  const out = Array(n);
  let running = 0;
  for (let k = 0; k < n; k++) {
    running += signed(next());
    out[k] = running;
  }
  return out;
}
// Read an ascending list of integers stored as (gap, run-length) pairs.
// Each pair skips `gap` values past the current position and then emits `run`
// consecutive integers; a falsy run-length terminates the list.
// eg. symbols 0,3, 1,1, 0,0 => [0, 1, 2, 5]
function read_sorted(next, prev = 0) {
  const out = [];
  let base = prev;
  for (;;) {
    const gap = next();
    const run = next();
    if (!run) return out;
    base += gap;
    for (let k = 0; k < run; k++) {
      out.push(base + k);
    }
    base += run + 1; // at least one value always separates two runs
  }
}
// read consecutive sorted lists until an empty one is encountered
function read_sorted_arrays(next) {
  return read_array_while(() => {
    const run = read_sorted(next);
    return run.length ? run : undefined;
  });
}
// returns a flat list of [x, ys] pairs (suitable for `new Map(...)`)
// the stream holds linear tables (terminated by width 0)
// followed by replacement tables (terminated by width+1 == 0)
function read_mapped(next) {
  const tables = [];
  for (let w = next(); !(w == 0); w = next()) {
    tables.push(read_linear_table(w, next));
  }
  for (let w = next() - 1; !(w < 0); w = next() - 1) {
    tables.push(read_replacement_table(w, next));
  }
  return tables.flat();
}
// call next(count-so-far) repeatedly, collecting results until one is falsy
// returns the collected values (the falsy terminator is not included)
function read_array_while(next) {
  const out = [];
  for (let item = next(0); item; item = next(out.length)) {
    out.push(item);
  }
  return out;
}
// the stream stores w columns, each a delta-encoded list of length n
// the result is the transpose: n rows of length w
function read_transposed(n, w, next) {
  const rows = Array.from({length: n}, () => []);
  for (let col = 0; col < w; col++) {
    const column = read_deltas(n, next);
    for (let row = 0; row < n; row++) {
      rows[row].push(column[row]);
    }
  }
  return rows;
}
// returns [[x, ys], [x+dx, ys+dy], [x+2*dx, ys+2*dy], ...] for every stored run
// where dx/dy = steps, run length comes from the stream, w = length of ys
function read_linear_table(w, next) {
  const dx = 1 + next();
  const dy = next();
  const run_lengths = read_array_while(next);
  const rows = read_transposed(run_lengths.length, 1 + w, next);
  const out = [];
  rows.forEach(([x, ...ys], i) => {
    const count = run_lengths[i];
    for (let j = 0; j < count; j++) {
      const shift = j * dy;
      out.push([x + j * dx, ys.map(y => y + shift)]);
    }
  });
  return out;
}
// returns [[x, ys], ...] where every ys has length w
// the row count is stored in the stream as (count - 1)
function read_replacement_table(w, next) {
  const count = 1 + next();
  const rows = read_transposed(count, 1 + w, next);
  return rows.map(([x, ...ys]) => [x, ys]);
}
// Decode a trie from the stream and expand it into the list of sequences it accepts.
// Node state bits: 1 = the path so far is a valid sequence, 2 = remember the last
// codepoint of the path, 4 = stop when the remembered codepoint equals the last one.
// The returned list is not sorted.
function read_trie(next) {
  const sequences = [];
  const alphabet = read_sorted(next); // branch labels are indices into this list
  // read one node (and, recursively, its children); `lead` are the characters that lead into it
  const decode = lead => {
    const state = next();
    const branches = read_array_while(() => {
      const cps = read_sorted(next).map(i => alphabet[i]);
      return cps.length ? decode(cps) : undefined;
    });
    return {S: state, B: branches, Q: lead};
  };
  // walk the decoded trie, collecting every path that ends on a valid node
  const expand = ({S, B}, cps, saved) => {
    const tail = cps[cps.length - 1];
    if ((S & 4) && saved === tail) return;
    if (S & 2) saved = tail;
    if (S & 1) sequences.push(cps);
    for (const branch of B) {
      for (const cp of branch.Q) {
        expand(branch, [...cps, cp], saved);
      }
    }
  };
  expand(decode([]), []);
  return sequences;
}
// uppercase hexadecimal form of a codepoint, at least 2 digits wide
function hex_cp(cp) {
  const hex = cp.toString(16).toUpperCase();
  return hex.padStart(2, '0');
}
// raffy convention: like "\u{X}" w/o the "\u", eg. 0x3000 => "{3000}"
function quote_cp(cp) {
  const hex = hex_cp(cp);
  return `{${hex}}`;
}
// split a string into its codepoints
// (manual loop: about 2x faster than `[...s].map(c => c.codePointAt(0))`)
function explode_cp(s) {
  const out = [];
  let at = 0;
  while (at < s.length) {
    const cp = s.codePointAt(at);
    out.push(cp);
    at += cp > 0xFFFF ? 2 : 1; // astral codepoints occupy two UTF-16 units
  }
  return out;
}
// Build a string from codepoints.
// cps: an array, or any other iterable/array-like of codepoints (eg. a Set).
//   Non-arrays are copied into an array first: previously such inputs had no
//   usable `.length` and silently produced "" (this is how nfd() of a Set
//   returned an empty result).
// Large inputs are converted in chunks so that the spread into
// String.fromCodePoint() never receives an excessive number of arguments.
function str_from_cps(cps) {
  const chunk = 4096;
  const list = Array.isArray(cps) ? cps : Array.from(cps);
  const len = list.length;
  if (len < chunk) return String.fromCodePoint(...list);
  const buf = [];
  for (let i = 0; i < len; i += chunk) {
    buf.push(String.fromCodePoint(...list.slice(i, i + chunk)));
  }
  return buf.join('');
}
// comparator: order by length first, then by the first differing element
// returns 0 when both arrays are equal
function compare_arrays(a, b) {
  let order = a.length - b.length;
  let i = 0;
  while (order == 0 && i < a.length) {
    order = a[i] - b[i];
    i++;
  }
  return order;
}
// reverse polyfill
// the custom nf implementation is replaced by the system implementation
// (String.prototype.normalize), which saves approximately 6KB
// cps: codepoints, form: normalization form name passed to normalize()
function nf(cps, form) {
  const text = str_from_cps(cps);
  const normalized = text.normalize(form);
  return explode_cp(normalized);
}
// codepoints => codepoints in Normalization Form C (composed)
function nfc(cps) {
  const form = 'NFC';
  return nf(cps, form);
}
// codepoints => codepoints in Normalization Form D (decomposed)
function nfd(cps) {
  const form = 'NFD';
  return nf(cps, form);
}
const HYPHEN = 0x2D; // "-"
const STOP_CH = '.'; // label separator
const STOP = 0x2E; // codepoint of STOP_CH
const FE0F = 0xFE0F; // codepoint stripped from emoji by filter_fe0f()
const UNIQUE_PH = 1; // WHOLE_MAP placeholder meaning "unique, non-confusable"
// 20230913: replace [...v] with Array_from(v) to avoid large spreads
// (explicit wrapper rather than Array.from.bind(Array))
const Array_from = (iterable) => Array.from(iterable);
// true if the codepoint belongs to the group's primary (P) or secondary (Q) set
// 20230913: keep primary and secondary distinct instead of creating valid union
function group_has_cp(g, cp) {
  const {P: primary, Q: secondary} = g;
  if (primary.has(cp)) return true;
  return secondary.has(cp);
}
// Array subclass used to tag emoji sequences: token arrays are told apart from
// plain text arrays by reading `.is_emoji` (free tagging system).
class Emoji extends Array {
  get is_emoji() {
    return true;
  }
}
// lookup tables, all populated by init() on first use
let MAPPED;
let IGNORED;
let CM;
let NSM;
let ESCAPE;
let NFC_CHECK;
let GROUPS;
let WHOLE_VALID;
let WHOLE_MAP;
let VALID;
let EMOJI_LIST;
let EMOJI_ROOT;
// Decode COMPRESSED into the module-level lookup tables (MAPPED, IGNORED, CM, NSM,
// ESCAPE, NFC_CHECK, GROUPS, WHOLE_VALID, WHOLE_MAP, VALID, EMOJI_LIST, EMOJI_ROOT).
// Runs once: later calls return immediately because MAPPED is already set.
// The tables are read sequentially from one stream, so the order of the reads
// below must not be changed.
function init() {
  if (MAPPED) return;
  let r = read_compressed_payload(COMPRESSED);
  const read_sorted_array = () => read_sorted(r);
  const read_sorted_set = () => new Set(read_sorted_array());
  const set_add_many = (set, v) => v.forEach(x => set.add(x));
  MAPPED = new Map(read_mapped(r));
  IGNORED = read_sorted_set(); // ignored characters are not valid, so just read raw codepoints
  /*
  // direct include from payload is smaller than the decompression code
  const FENCED = new Map(read_array_while(() => {
    let cp = r();
    if (cp) return [cp, read_str(r())];
  }));
  */
  // 20230217: we still need all CM for proper error formatting
  // but norm only needs NSM subset that are potentially-valid
  CM = read_sorted_array();
  NSM = new Set(read_sorted_array().map(i => CM[i])); // stored as indices into CM
  CM = new Set(CM);
  ESCAPE = read_sorted_set(); // characters that should not be printed
  NFC_CHECK = read_sorted_set(); // only needed to illustrate ens_tokenize() transformations
  let chunks = read_sorted_arrays(r);
  let unrestricted = r(); // number of leading groups that are not restricted
  //const read_chunked = () => new Set(read_sorted_array().flatMap(i => chunks[i]).concat(read_sorted_array()));
  const read_chunked = () => {
    // 20230921: build set in parts, 2x faster
    let set = new Set();
    read_sorted_array().forEach(i => set_add_many(set, chunks[i]));
    set_add_many(set, read_sorted_array());
    return set;
  };
  GROUPS = read_array_while(i => {
    // minifier property mangling seems unsafe
    // so these are manually renamed to single chars
    let N = read_array_while(r).map(x => x+0x60); // name, stored as offsets from 0x60
    if (N.length) {
      let R = i >= unrestricted; // unrestricted then restricted
      N[0] -= 32; // capitalize
      N = str_from_cps(N);
      if (R) N=`Restricted[${N}]`;
      let P = read_chunked(); // primary
      let Q = read_chunked(); // secondary
      let M = !r(); // not-whitelisted, check for NSM
      // *** this code currently isn't needed ***
      /*
      let V = [...P, ...Q].sort((a, b) => a-b); // derive: sorted valid
      let M = r()-1; // number of combining mark
      if (M < 0) { // whitelisted
        M = new Map(read_array_while(() => {
          let i = r();
          if (i) return [V[i-1], read_array_while(() => {
            let v = read_array_while(r);
            if (v.length) return v.map(x => x-1);
          })];
        }));
      }*/
      return {N, P, Q, M, R};
    }
  });
  // decode compressed wholes
  WHOLE_VALID = read_sorted_set();
  WHOLE_MAP = new Map();
  let wholes = read_sorted_array().concat(Array_from(WHOLE_VALID)).sort((a, b) => a-b); // must be sorted
  wholes.forEach((cp, i) => {
    let d = r(); // 0 => start a new whole, otherwise => join the whole found d entries back
    let w = wholes[i] = d ? wholes[i-d] : {V: [], M: new Map()};
    w.V.push(cp); // add to member set
    if (!WHOLE_VALID.has(cp)) {
      WHOLE_MAP.set(cp, w); // register with whole map
    }
  });
  // compute confusable-extent complements
  // usage: WHOLE_MAP.get(cp).M.get(cp) = complement set
  for (let {V, M} of new Set(WHOLE_MAP.values())) {
    // connect all groups that have each whole character
    let recs = [];
    for (let cp of V) {
      let gs = GROUPS.filter(g => group_has_cp(g, cp));
      let rec = recs.find(({G}) => gs.some(g => G.has(g)));
      if (!rec) {
        rec = {G: new Set(), V: []};
        recs.push(rec);
      }
      rec.V.push(cp);
      set_add_many(rec.G, gs);
    }
    // per character cache groups which are not a member of the extent
    let union = recs.flatMap(x => Array_from(x.G)); // all of the groups used by this whole
    for (let {G, V} of recs) {
      let complement = new Set(union.filter(g => !G.has(g))); // groups not covered by the extent
      for (let cp of V) {
        M.set(cp, complement); // this is the same reference
      }
    }
  }
  // compute valid set
  // 20230924: VALID was union but can be re-used
  VALID = new Set(); // exists in 1+ groups
  let multi = new Set(); // exists in 2+ groups
  const add_to_union = cp => VALID.has(cp) ? multi.add(cp) : VALID.add(cp);
  for (let g of GROUPS) {
    for (let cp of g.P) add_to_union(cp);
    for (let cp of g.Q) add_to_union(cp);
  }
  // dual purpose WHOLE_MAP: return placeholder if unique non-confusable
  for (let cp of VALID) {
    if (!WHOLE_MAP.has(cp) && !multi.has(cp)) {
      WHOLE_MAP.set(cp, UNIQUE_PH);
    }
  }
  // add all decomposed parts
  // see derive: "Valid is Closed (via Brute-force)"
  // note: in this build nfd() goes through str_from_cps(), which reads `.length`,
  // so it must be given an array; handing it the Set itself decomposed to nothing
  set_add_many(VALID, nfd(Array_from(VALID)));
  // decode emoji
  // 20230719: emoji are now fully-expanded to avoid quirk logic
  EMOJI_LIST = read_trie(r).map(v => Emoji.from(v)).sort(compare_arrays);
  EMOJI_ROOT = new Map(); // this has approx 7K nodes (2+ per emoji)
  for (let cps of EMOJI_LIST) {
    // 20230719: change to *slightly* stricter algorithm which disallows
    // insertion of misplaced FE0F in emoji sequences (matching ENSIP-15)
    // example: beautified [A B] (eg. flag emoji)
    //  before: allow: [A FE0F B], error: [A FE0F FE0F B]
    //   after: error: both
    // note: this code now matches ENSNormalize.{cs,java} logic
    let prev = [EMOJI_ROOT];
    for (let cp of cps) {
      let next = prev.map(node => {
        let child = node.get(cp);
        if (!child) {
          // should this be object?
          // (most have 1-2 items, few have many)
          // 20230719: no, v8 default map is 4?
          child = new Map();
          node.set(cp, child);
        }
        return child;
      });
      if (cp === FE0F) {
        // FE0F is optional: keep the nodes before it so the sequence also matches without it
        prev.push(...next); // less than 20 elements
      } else {
        prev = next;
      }
    }
    // every node that can terminate this emoji points at the fully-qualified sequence
    for (let x of prev) {
      x.V = cps;
    }
  }
}
// printable description of a single codepoint
// if escaped: {HEX}
// else: "x" {HEX}
function quoted_cp(cp) {
  if (should_escape(cp)) {
    return quote_cp(cp);
  }
  const literal = bidi_qq(safe_str_from_cps([cp]));
  return `${literal} ${quote_cp(cp)}`;
}
// wrap a string in double quotes followed by U+200E (strong LTR)
// 20230211: some messages can be mixed-directional and result in spillover;
// the 200E after the quoted string keeps the remainder of the message from
// acquiring the direction of the quote
// https://www.w3.org/International/questions/qa-bidi-unicode-controls#exceptions
function bidi_qq(s) {
  const LRM = '\u200E';
  return `"${s}"${LRM}`;
}
// throw if the 3rd and 4th codepoints are both hyphens (eg. "xn--...")
function check_label_extension(cps) {
  const has_double_hyphen = cps.length >= 4 && cps[2] == HYPHEN && cps[3] == HYPHEN;
  if (!has_double_hyphen) return;
  const prefix = str_from_cps(cps.slice(0, 4)); // this can only be ascii so cant be bidi
  throw new Error(`invalid label extension: "${prefix}"`);
}
// underscores may only form a prefix of the label:
// throw if anything other than an underscore precedes the last underscore
function check_leading_underscore(cps) {
  const UNDERSCORE = 0x5F;
  const last = cps.lastIndexOf(UNDERSCORE);
  for (let k = last - 1; k >= 0; k--) {
    if (cps[k] !== UNDERSCORE) {
      throw new Error('underscore allowed only at start');
    }
  }
}
// check that a fenced cp is not leading, trailing, or touching another fenced cp
function check_fenced(cps) {
  let prev = FENCED.get(cps[0]);
  if (prev) throw error_placement(`leading ${prev}`);
  const n = cps.length;
  // index right after the most recent fenced cp
  // (-1 can never equal an index or n, so nothing is reported until a fenced cp is seen)
  let after_fenced = -1;
  for (let i = 1; i < n; i++) {
    const name = FENCED.get(cps[i]);
    if (!name) continue;
    // since cps[0] isn't fenced, cps[1] cannot throw
    if (after_fenced == i) throw error_placement(`${prev} + ${name}`);
    after_fenced = i + 1;
    prev = name;
  }
  if (after_fenced == n) throw error_placement(`trailing ${prev}`);
}
// create a safe to print string
//   cps: array of codepoints
//   max: if cps exceed max, middle truncate with an ellipsis (U+2026),
//        keeping floor(max/2) codepoints from each end
//   quoter(cp) => string used for escaped codepoints, eg. 3000 => "{3000}"
// invisibles are escaped
// a leading combining mark is prefixed with a placeholder (dotted circle)
// note: in html, you'd call this function then replace [<>&] with entities
function safe_str_from_cps(cps, max = Infinity, quoter = quote_cp) {
  const buf = [];
  if (is_combining_mark(cps[0])) buf.push('◌');
  if (cps.length > max) {
    const half = max >> 1;
    // slice(-0) is slice(0) and would return the whole array,
    // so a zero-width half keeps nothing from the tail
    const tail = half === 0 ? [] : cps.slice(-half);
    cps = [...cps.slice(0, half), 0x2026, ...tail];
  }
  let prev = 0; // start of the pending run of printable codepoints
  const n = cps.length;
  for (let i = 0; i < n; i++) {
    const cp = cps[i];
    if (should_escape(cp)) {
      buf.push(str_from_cps(cps.slice(prev, i)));
      buf.push(quoter(cp));
      prev = i + 1;
    }
  }
  buf.push(str_from_cps(cps.slice(prev, n)));
  return buf.join('');
}
// membership test against the CM set
// note: set(s) cannot be exposed because they can be modified
// note: Object.freeze() doesn't work
function is_combining_mark(cp) {
  init(); // CM is populated lazily
  const marks = CM;
  return marks.has(cp);
}
// true for characters that should not be printed as-is (members of ESCAPE)
function should_escape(cp) {
  init(); // ESCAPE is populated lazily
  const hidden = ESCAPE;
  return hidden.has(cp);
}
// return all supported emoji as fully-qualified emoji
// ordered by length then lexicographic
// every entry is a copy because the result is exposed to callers
function ens_emoji() {
  init();
  return Array.from(EMOJI_LIST, emoji => emoji.slice());
}
// normalize each "."-separated label of a name fragment without running the
// label-level checks performed by split(); throws on a disallowed character
//   decompose: truthy => NFD, otherwise NFC
function ens_normalize_fragment(frag, decompose) {
  init();
  const normalizer = decompose ? nfd : nfc;
  const labels = frag.split(STOP_CH);
  const normalized = labels.map(label => {
    const tokens = tokens_from_str(explode_cp(label), normalizer, filter_fe0f);
    return str_from_cps(tokens.flat());
  });
  return normalized.join(STOP_CH);
}
// normalize a name (emoji are emitted without FE0F); throws on the first invalid label
function ens_normalize(name) {
  const labels = split(name, nfc, filter_fe0f);
  return flatten(labels);
}
// normalize a name for display: emoji stay fully-qualified and
// ξ (3BE) becomes Ξ (39E) in every label whose type is not Greek
// throws on the first invalid label
function ens_beautify(name) {
  const keep_emoji = x => x; // emoji not exposed
  const labels = split(name, nfc, keep_emoji);
  for (const label of labels) {
    if (label.error) break; // flatten will throw
    // 20230121: replacing leading/trailing (or all) hyphens with the unicode
    // variant U+2010 was considered: not exactly the same in every font, but
    // very similar: "-" vs "‐"
    // (`node tools/reg-count.js regex ^-\{2,\}` => 592)
    // 20230123: WHATWG URL uses "CheckHyphens" false
    // https://url.spec.whatwg.org/#idna
    // update ethereum symbol
    // ξ => Ξ if not greek
    if (label.type !== 'Greek') {
      array_replace(label.output, 0x3BE, 0x39E);
    }
    // 20221213: prefixing the output with 200E fixes the bidi subdomain issue,
    // but breaks invariant (200E is disallowed)
    // could be fixed with special case for: 2D (.) + 200E (LTR)
    // https://discuss.ens.domains/t/bidi-label-ordering-spoof/15824
  }
  return flatten(labels);
}
// replace every occurrence of a with b, in place
function array_replace(v, a, b) {
  for (let at = v.indexOf(a, 0); at >= 0; at = v.indexOf(a, at + 1)) {
    v[at] = b;
  }
}
// split a name into per-label records (see split())
//   preserve_emoji: truthy => emoji tokens keep FE0F, otherwise FE0F is stripped
function ens_split(name, preserve_emoji) {
  const copy_emoji = emoji => emoji.slice(); // emoji are exposed so copy
  const ef = preserve_emoji ? copy_emoji : filter_fe0f;
  return split(name, nfc, ef);
}
// Split a name on "." and validate every label independently.
//   name: string to process (a falsy name yields [])
//   nf:   normalization function applied to runs of text codepoints
//   ef:   function applied to every matched emoji sequence
// Returns one record per label: {input, offset} always, plus {tokens, output,
// emoji, type} as far as processing got. A failure does not throw: the Error
// is stored on the record as `error` and processing continues with the next label.
function split(name, nf, ef) {
if (!name) return []; // 20230719: empty name allowance
init();
let offset = 0;
// https://unicode.org/reports/tr46/#Validity_Criteria
// 4.) "The label must not contain a U+002E ( . ) FULL STOP."
return name.split(STOP_CH).map(label => {
let input = explode_cp(label);
let info = {
input,
offset, // codepoint, not substring!
};
offset += input.length + 1; // + stop
try {
// 1.) "The label must be in Unicode Normalization Form NFC"
let tokens = info.tokens = tokens_from_str(input, nf, ef);
let token_count = tokens.length;
let type;
if (!token_count) { // the label was effectively empty (could of had ignored characters)
//norm = [];
//type = 'None'; // use this instead of next match, "ASCII"
// 20230120: change to strict
// https://discuss.ens.domains/t/ens-name-normalization-2nd/14564/59
throw new Error(`empty label`);
}
let norm = info.output = tokens.flat();
check_leading_underscore(norm);
let emoji = info.emoji = token_count > 1 || tokens[0].is_emoji; // same as: tokens.some(x => x.is_emoji);
if (!emoji && norm.every(cp => cp < 0x80)) { // special case for ascii
// 20230123: matches matches WHATWG, see note 3.3
check_label_extension(norm); // only needed for ascii
// cant have fenced
// cant have cm
// cant have wholes
// see derive: "Fastpath ASCII"
type = 'ASCII';
} else {
let chars = tokens.flatMap(x => x.is_emoji ? [] : x); // all of the nfc tokens concat together
if (!chars.length) { // theres no text, just emoji
type = 'Emoji';
} else {
// 5.) "The label must not begin with a combining mark, that is: General_Category=Mark."
if (CM.has(norm[0])) throw error_placement('leading combining mark');
for (let i = 1; i < token_count; i++) { // we've already checked the first token
let cps = tokens[i];
if (!cps.is_emoji && CM.has(cps[0])) { // every text token has emoji neighbors, eg. EtEEEtEt...
// bidi_qq() not needed since emoji is LTR and cps is a CM
throw error_placement(`emoji + combining mark: "${str_from_cps(tokens[i-1])} + ${safe_str_from_cps([cps[0]])}"`);
}
}
check_fenced(norm);
let unique = Array_from(new Set(chars));
let [g] = determine_group(unique); // take the first match
// see derive: "Matching Groups have Same CM Style"
// alternative: could form a hybrid type: Latin/Japanese/...
check_group(g, chars); // need text in order
check_whole(g, unique); // only need unique text (order would be required for multiple-char confusables)
type = g.N;
// 20230121: consider exposing restricted flag
// it's simpler to just check for 'Restricted'
// or even better: type.endsWith(']')
//if (g.R) info.restricted = true;
}
}
info.type = type;
} catch (err) {
info.error = err; // use full error object
}
return info;
});
}
// whole-script confusable check
//   group:  the group the label was matched to (only used for the error message)
//   unique: distinct text codepoints of the label
// throws if some other group could produce a look-alike of the whole label
function check_whole(group, unique) {
  let candidates; // groups able to mimic every confusable seen so far (undefined until the first one)
  const shared = []; // codepoints without any confusable record
  for (const cp of unique) {
    const whole = WHOLE_MAP.get(cp);
    if (whole === UNIQUE_PH) return; // unique, non-confusable
    if (!whole) {
      shared.push(cp);
      continue;
    }
    const lookalikes = whole.M.get(cp); // groups which have a character that look-like this character
    candidates = candidates ? candidates.filter(g => lookalikes.has(g)) : Array_from(lookalikes);
    if (!candidates.length) return; // confusable intersection is empty
  }
  if (!candidates) return; // no confusable at all
  // we have 1+ confusable
  // check if any of the remaining groups contain the shared characters too
  const clash = candidates.find(g => shared.every(cp => group_has_cp(g, cp)));
  if (clash) {
    throw new Error(`whole-script confusable: ${group.N}/${clash.N}`);
  }
}
// assumption: unique.size > 0
// narrow GROUPS down to those containing every given codepoint
// returns list of matching groups (scanning stops early once a single group remains)
// throws if the codepoints cannot all come from one group
function determine_group(unique) {
  let candidates = GROUPS;
  for (const cp of unique) {
    // note: we need to dodge CM that are whitelisted
    // but that code isn't currently necessary
    const narrowed = candidates.filter(g => group_has_cp(g, cp));
    if (narrowed.length == 0) {
      const known = GROUPS.some(g => group_has_cp(g, cp));
      if (known) {
        // there is no group that contains all these characters
        // throw using the highest priority group that matched
        // https://www.unicode.org/reports/tr39/#mixed_script_confusables
        throw error_group_member(candidates[0], cp);
      }
      // the character was composed of valid parts
      // but it's NFC form is invalid
      // 20230716: change to more exact statement, see: ENSNormalize.{cs,java}
      // note: this doesn't have to be a composition
      // 20230720: change to full check
      throw error_disallowed(cp); // this should be rare
    }
    candidates = narrowed;
    if (narrowed.length == 1) break; // there is only one group left
  }
  // there are at least 1 group(s) with all of these characters
  return candidates;
}
// join the outputs of split() records back into a name
// throws on the first record that carries an error
function flatten(split) {
  const parts = [];
  for (const {input, error, output} of split) {
    if (error) {
      const msg = error.message;
      // don't print label again if just a single label
      // bidi_qq() only necessary if msg is digits
      throw new Error(split.length == 1 ? msg : `Invalid label ${bidi_qq(safe_str_from_cps(input, 63))}: ${msg}`);
    }
    parts.push(str_from_cps(output));
  }
  return parts.join(STOP_CH);
}
// build (not throw) the error for a codepoint that is not allowed at all
function error_disallowed(cp) {
  // TODO: add cp to error?
  const shown = quoted_cp(cp);
  return new Error(`disallowed character: ${shown}`);
}
// build (not throw) the error for a codepoint that does not belong to group g
// the codepoint is labelled with the first group listing it as primary, if any
function error_group_member(g, cp) {
  const shown = quoted_cp(cp);
  const owner = GROUPS.find(candidate => candidate.P.has(cp)); // only check primary
  const described = owner ? `${owner.N} ${shown}` : shown;
  return new Error(`illegal mixture: ${g.N} + ${described}`);
}
// build (not throw) the error for a character found in a forbidden position
function error_placement(where) {
  const message = `illegal placement: ${where}`;
  return new Error(message);
}
// Verify that every codepoint belongs to group g and, when g.M is set, that
// runs of non-spacing marks in the NFD form contain no duplicates and are at
// most NSM_MAX long. Throws on the first violation; returns nothing.
// assumption: cps.length > 0
// assumption: cps[0] isn't a CM
// assumption: the previous character isn't an emoji
function check_group(g, cps) {
for (let cp of cps) {
if (!group_has_cp(g, cp)) {
// for whitelisted scripts, this will throw illegal mixture on invalid cm, eg. "e{300}{300}"
// at the moment, it's unnecessary to introduce an extra error type
// until there exists a whitelisted multi-character
// eg. if (M < 0 && is_combining_mark(cp)) { ... }
// there are 3 cases:
// 1. illegal cm for wrong group => mixture error
// 2. illegal cm for same group => cm error
// requires set of whitelist cm per group:
// eg. new Set([...g.P, ...g.Q].flatMap(nfc).filter(cp => CM.has(cp)))
// 3. wrong group => mixture error
throw error_group_member(g, cp);
}
}
//if (M >= 0) { // we have a known fixed cm count
if (g.M) { // we need to check for NSM
let decomposed = nfd(cps);
for (let i = 1, e = decomposed.length; i < e; i++) { // see: assumption
// 20230210: bugfix: using cps instead of decomposed h/t Carbon225
/*
if (CM.has(decomposed[i])) {
let j = i + 1;
while (j < e && CM.has(decomposed[j])) j++;
if (j - i > M) {
throw new Error(`too many combining marks: ${g.N} ${bidi_qq(str_from_cps(decomposed.slice(i-1, j)))} (${j-i}/${M})`);
}
i = j;
}
*/
// 20230217: switch to NSM counting
// https://www.unicode.org/reports/tr39/#Optional_Detection
if (NSM.has(decomposed[i])) {
// i is the first mark of a run; j scans forward to one past its last mark
let j = i + 1;
for (let cp; j < e && NSM.has(cp = decomposed[j]); j++) {
// a. Forbid sequences of the same nonspacing mark.
for (let k = i; k < j; k++) { // O(n^2) but n < 100
if (decomposed[k] == cp) {
throw new Error(`duplicate non-spacing marks: ${quoted_cp(cp)}`);
}
}
}
// parse to end so we have full nsm count
// b. Forbid sequences of more than 4 nonspacing marks (gc=Mn or gc=Me).
if (j - i > NSM_MAX) {
// note: this slice starts with a base char or spacing-mark cm
throw new Error(`excessive non-spacing marks: ${bidi_qq(safe_str_from_cps(decomposed.slice(i-1, j)))} (${j-i}/${NSM_MAX})`);
}
// decomposed[j] is not an NSM (or j == e), so the loop's i++ may step over it
i = j;
}
}
}
// *** this code currently isn't needed ***
/*
let cm_whitelist = M instanceof Map;
for (let i = 0, e = cps.length; i < e; ) {
let cp = cps[i++];
let seqs = cm_whitelist && M.get(cp);
if (seqs) {
// list of codepoints that can follow
// if this exists, this will always be 1+
let j = i;
while (j < e && CM.has(cps[j])) j++;
let cms = cps.slice(i, j);
let match = seqs.find(seq => !compare_arrays(seq, cms));
if (!match) throw new Error(`disallowed combining mark sequence: "${safe_str_from_cps([cp, ...cms])}"`);
i = j;
} else if (!V.has(cp)) {
// https://www.unicode.org/reports/tr39/#mixed_script_confusables
let quoted = quoted_cp(cp);
for (let cp of cps) {
let u = UNIQUE.get(cp);
if (u && u !== g) {
// if both scripts are restricted this error is confusing
// because we don't differentiate RestrictedA from RestrictedB
if (!u.R) quoted = `${quoted} is ${u.N}`;
break;
}
}
throw new Error(`disallowed ${g.N} character: ${quoted}`);
//throw new Error(`disallowed character: ${quoted} (expected ${g.N})`);
//throw new Error(`${g.N} does not allow: ${quoted}`);
}
}
if (!cm_whitelist) {
let decomposed = nfd(cps);
for (let i = 1, e = decomposed.length; i < e; i++) { // we know it can't be cm leading
if (CM.has(decomposed[i])) {
let j = i + 1;
while (j < e && CM.has(decomposed[j])) j++;
if (j - i > M) {
throw new Error(`too many combining marks: "${str_from_cps(decomposed.slice(i-1, j))}" (${j-i}/${M})`);
}
i = j;
}
}
}
*/
}
// given a list of codepoints
// returns a list of lists, where emoji are a fully-qualified (as Array subclass)
// eg. explode_cp("abc💩d") => [[61, 62, 63], Emoji[1F4A9, FE0F], [64]]
//   nf: normalization applied to each run of text codepoints
//   ef: transformation applied to each matched emoji
// throws if a codepoint is neither valid, mapped nor ignored
// 20230818: rename for 'process' name collision h/t Javarome
// https://github.com/adraffy/ens-normalize.js/issues/23
function tokens_from_str(input, nf, ef) {
  const tokens = [];
  let text = []; // pending run of text codepoints
  // emit the pending text run (if any) as a normalized token
  const flush = () => {
    if (text.length) {
      tokens.push(nf(text));
      text = [];
    }
  };
  const stack = input.slice().reverse(); // flip so we can pop
  while (stack.length) {
    const emoji = consume_emoji_reversed(stack);
    if (emoji) {
      flush();
      tokens.push(ef(emoji));
      continue;
    }
    const cp = stack.pop();
    if (VALID.has(cp)) {
      text.push(cp);
      continue;
    }
    const mapped = MAPPED.get(cp);
    if (mapped) {
      text.push(...mapped); // less than 10 elements
    } else if (!IGNORED.has(cp)) {
      // 20230912: unicode 15.1 changed the order of processing such that
      // disallowed parts are only rejected after NFC
      // https://unicode.org/reports/tr46/#Validity_Criteria
      // this doesn't impact normalization as of today
      // technically, this error can be removed as the group logic will apply similar logic
      // however the error type might be less clear
      throw error_disallowed(cp);
    }
  }
  flush();
  return tokens;
}
// copy of cps without any FE0F
// (uses .filter() so that an Emoji input still produces an Emoji)
function filter_fe0f(cps) {
  const is_not_fe0f = cp => cp != FE0F;
  return cps.filter(is_not_fe0f);
}
// given array of codepoints (reversed: the next input codepoint is the last element)
// returns the longest valid emoji sequence (or undefined if no match)
// *MUTATES* the supplied array: matched codepoints are removed from its end
// disallows interleaved ignored characters
// fills (optional) eaten array with matched codepoints
function consume_emoji_reversed(cps, eaten) {
  let node = EMOJI_ROOT;
  let longest;
  for (let pos = cps.length - 1; pos >= 0; pos--) {
    node = node.get(cps[pos]);
    if (!node) break;
    const match = node.V;
    if (!match) continue;
    // this is a valid emoji (so far)
    longest = match;
    if (eaten) eaten.push(...cps.slice(pos).reverse()); // (optional) copy input, used for ens_tokenize()
    cps.length = pos; // truncate
  }
  return longest;
}
// ************************************************************
// tokenizer
// values of the `type` field on tokens produced by ens_tokenize()
const TY_VALID = 'valid'; // run of valid codepoints
const TY_MAPPED = 'mapped'; // codepoint replaced via MAPPED
const TY_IGNORED = 'ignored'; // codepoint in IGNORED
const TY_DISALLOWED = 'disallowed'; // codepoint matched by nothing else
const TY_EMOJI = 'emoji'; // matched emoji sequence
const TY_NFC = 'nfc'; // bundle of tokens that changed under NFC
const TY_STOP = 'stop'; // label separator
// Break a name into a flat list of typed tokens without throwing on invalid input.
// Token types: emoji, stop, valid, ignored, mapped, disallowed and (when the
// `nf` option is true) nfc, which bundles adjacent tokens whose codepoints
// change under NFC. Adjacent valid tokens are merged before returning.
function ens_tokenize(name, {
nf = true, // collapse unnormalized runs into a single token
} = {}) {
init();
let input = explode_cp(name).reverse();
let eaten = [];
let tokens = [];
while (input.length) {
let emoji = consume_emoji_reversed(input, eaten);
if (emoji) {
tokens.push({
type: TY_EMOJI,
emoji: emoji.slice(), // copy emoji
input: eaten,
cps: filter_fe0f(emoji)
});
eaten = []; // reset buffer
} else {
let cp = input.pop();
if (cp == STOP) {
tokens.push({type: TY_STOP, cp});
} else if (VALID.has(cp)) {
tokens.push({type: TY_VALID, cps: [cp]});
} else if (IGNORED.has(cp)) {
tokens.push({type: TY_IGNORED, cp});
} else {
let cps = MAPPED.get(cp);
if (cps) {
tokens.push({type: TY_MAPPED, cp, cps: cps.slice()});
} else {
tokens.push({type: TY_DISALLOWED, cp});
}
}
}
}
if (nf) {
// `start` remembers the index of the preceding valid/mapped token that did not
// itself require a check, so that it is included in the normalized slice
for (let i = 0, start = -1; i < tokens.length; i++) {
let token = tokens[i];
if (is_valid_or_mapped(token.type)) {
if (requires_check(token.cps)) { // normalization might be needed
let end = i + 1;
for (let pos = end; pos < tokens.length; pos++) { // find adjacent text
let {type, cps} = tokens[pos];
if (is_valid_or_mapped(type)) {
if (!requires_check(cps)) break;
end = pos + 1;
} else if (type !== TY_IGNORED) { // || type !== TY_DISALLOWED) {
break;
}
}
if (start < 0) start = i;
let slice = tokens.slice(start, end);
let cps0 = slice.flatMap(x => is_valid_or_mapped(x.type) ? x.cps : []); // strip junk tokens
let cps = nfc(cps0);
if (compare_arrays(cps, cps0)) { // bundle into an nfc token
tokens.splice(start, end - start, {
type: TY_NFC,
input: cps0, // there are 3 states: tokens0 ==(process)=> input ==(nfc)=> tokens/cps
cps,
tokens0: collapse_valid_tokens(slice),
tokens: ens_tokenize(str_from_cps(cps), {nf: false})
});
// the slice collapsed into one token at `start`; resume right after it
i = start;
} else {
i = end - 1; // skip to end of slice
}
start = -1; // reset
} else {
start = i; // remember last
}
} else if (token.type !== TY_IGNORED) { // 20221024: is this correct?
start = -1; // reset
}
}
}
return collapse_valid_tokens(tokens);
}
// true for the two token types that carry text codepoints in `cps`
function is_valid_or_mapped(type) {
  if (type == TY_VALID) return true;
  return type == TY_MAPPED;
}
// true if any of the codepoints is a member of NFC_CHECK
function requires_check(cps) {
  for (const cp of cps) {
    if (NFC_CHECK.has(cp)) return true;
  }
  return false;
}
// merge every run of adjacent valid tokens into a single valid token
// the array is modified in place and also returned
function collapse_valid_tokens(tokens) {
  for (let i = 0; i < tokens.length; i++) {
    if (tokens[i].type != TY_VALID) continue;
    let end = i + 1;
    while (end < tokens.length && tokens[end].type == TY_VALID) end++;
    const merged = tokens.slice(i, end).flatMap(t => t.cps);
    tokens.splice(i, end - i, {type: TY_VALID, cps: merged});
  }
  return tokens;
}
export { ens_beautify, ens_emoji, ens_normalize, ens_normalize_fragment, ens_split, ens_tokenize, is_combining_mark, nfc, nfd, safe_str_from_cps, should_escape };