/** Parsing a UnicodeSet into an AST. So an earlier verison of this code I was using the node `ebnf`
* library to do interpreted parsing. That was working well, but when I was about to publish I found
* out it didn't handle unicode surrogate pairs. I tried ANTLR4 next, but found the need for a
* separate lexer/parser unsuitable for the heavy context-sensitive tokens of UnicodeSets. I looked
* into peg/peggy, but they did not have unicode support yet. Seemed simpler just to handwrite the
* parser in the end.
*/
import * as characters from "./unidata/characters.mjs";
import * as properties from "./unidata/properties.mjs";
/** Handles reading UTF-32 characters from JavaScript string, with lookahead ability */
class CharacterStream{
/** Create a new streamer object
* @param {string} pattern the pattern string to iterate over
*/
constructor(pattern){
/** Pattern we're iterating over for this stream */
this.pattern = pattern;
/** Current line number, where lines are given by '\n' character */
this.line = 0;
/** Current column number in UTF-16 encoding */
this.col_utf16 = 0;
/** Current column number, in UTF-32 encoding */
this.col_utf32 = 0;
/** Buffered characters from the stream */
this.buffer = [];
/** Character stream generator */
this.stream = this.#streamer(pattern);
}
/** Throw a syntax error at this point in the stream
* @param {string} expected message indicating what was expected; prefix/suffix is generated
* @param {number} [discard=0] how many buffered characters were okay; the next character
* (possibly buffered) after those discarded is the one that triggered the error
*/
error(expected, discard=0){
if (discard)
this.discard(discard);
const err = new SyntaxError(`Expected ${expected} at line #${this.line}, char #${this.col_utf32}`);
err.line = this.line;
err.col_utf16 = this.col_utf16;
err.col_utf32 = this.col_utf32;
err.valid = this.pattern.substring(0, err.col_utf16);
err.invalid = this.pattern.substring(err.col_utf16);
throw err;
}
/** Create the character stream generator */
*#streamer(pattern){
// will properly handle surrogates
for (const char of pattern){
// track line/chars for useful error messages
this.col_utf16 += char.length;
this.col_utf32++;
if (char === '\n')
this.line++;
yield char;
}
}
/** Ensure at least n>0 characters are buffered if possible
* @param {number} [n=1] number of characters to buffer
* @returns {boolean} true if we could fulfill buffer request
*/
buffer(n=1){
let needed = n-this.buffer.length;
while (needed > 0){
const v = this.stream.next();
// not enough characters
if (v.done)
return false;
this.buffer.push(v.value);
needed--;
}
return true;
}
/** Discard n>0 characters; throws if there are not n characters
* @param {number} [n=1] number of characters to discard
* @returns {boolean} true if we could fulfill discard request
*/
discard(n=1){
if (this.buffer.length > n)
this.buffer.splice(0, n);
else{
n -= this.buffer.length;
// discard additional non-buffered?
while (n){
if (this.stream.next().done)
return false;
n--;
}
this.buffer.length = 0;
}
return true;
}
/** Fetch n>0 characters as a string
* @param {number} [n=1] number of characters to peek
* @returns {?string} the peeked string; null if we can't get n characters
*/
peek(n=1){
const l = this.buffer.length;
let str;
if (l){
if (n === 1)
return this.buffer[0];
if (l > n){
str = "";
for (let i=0; i<n; i++)
str += this.buffer[i];
return str;
}
str = this.buffer.join("");
n -= l;
}
else str = "";
// add excess non-buffered?
while (n){
const v = this.stream.next();
// not enough characters
if (v.done)
return null;
str += v.value;
this.buffer.push(v.value);
n--;
}
return str;
}
/** Fetch n>0 characters and discard them
* @param {number} [n=1] number of characters to pop
* @returns {?string} the popped string; null if we can't get n characters
*/
pop(n=1){
const l = this.buffer.length;
if (l > n)
return this.buffer.splice(0, n).join("");
let str;
if (l){
str = l === 1 ? this.buffer[0] : this.buffer.join("");
n -= l;
this.buffer.length = 0;
}
else str = "";
// more non-buffered chars needed?
while (n){
const v = this.stream.next();
// can't fulfill request?
if (v.done)
return null;
str += v.value;
n--;
}
return str;
}
}
/** AST node generated by the parser
* @typedef {object} Node
* @prop {string} type The node type; can be one of the following:
* - `empty`: an empty set
* - `group`: a group of 2+ sets to be combined with set operations
* - `property`: a set defined by unicode property name/value
* - `char`: single characters
* - `charRange`: character ranges
* - `string`: single strings
* - `stringRange`: string ranges
* @prop {string} op A set operation indicating how it should be combined with the
* previous set in a group. One of: `union`, `difference`, `intersect`, or `symmetricDifference`
* @prop {boolean} invert Whether the set is inverted (e.g. the unary complement set operation)
* @prop {?string} name Only present for `property` type; it is the extracted property name
* @prop {?any} value Absent for `empty` type. Otherwise:
* - `group`: a list of nested {@link Node}
* - `property`: the extracted property value string
* - `char` and `string`: a list of strings
* - `charRange` and `stringRange`: a list of string tuples, giving range `[start, end]`
* @prop {?Codepoints} group Only present for `property` type; it represents the set of codepoints
* for that unicode property name-value pair
*/
// Not using regex here since its finicky with surrogate pairs
function is_ws(c){
// pattern whitespace
return c?.length === 1 && (
(c >= '\x09' && c <= '\x0D') ||
"\x20\x85\u200E\u200F\u2028\u2029".indexOf(c) !== -1);
}
function is_hex(c){
return c?.length === 1 && (
(c >= 'a' && c <= 'f') ||
(c >= 'A' && c <= 'F') ||
(c >= '0' && c <= '9'));
}
/** Handwritten parser to parse UnicodeSet patterns as an abstract syntax tree.
* Friendly error messages throughout
*/
class Parser{
/** Parse a UnicodeSet pattern into an abstract syntax tree (AST)
* @param {string} pattern the pattern to parse
* @returns {Promise<Node>}
*/
async parse(pattern){
this.stream = new CharacterStream(pattern);
const ast = await this.root();
// ensure EOF
const c = this.pop_nonws();
if (c !== null)
this.stream.error("end-of-string");
return ast;
}
/** Root set, enclosing all others
* @returns {Promise<Node>}
*/
root(){
let c = this.pop_nonws();
if (c === '['){
c = this.stream.peek();
if (c === ':'){
this.stream.discard();
return this.posixProperty();
}
return this.sequence();
}
else if (c === '\\')
return this.perlProperty();
this.stream.error("[ denoting set start, or a Perl-style property");
}
//// PROPERTIES
/** Extract node for Posix-style property
* @returns {Promise<Node>}
*/
async posixProperty(){
const node = await this.property(false, ':');
const c = this.stream.pop();
if (c !== ']')
this.stream.error("] to end the property");
return node;
}
/** Extract node for Perl-style property
* @returns {Promise<Node>}
*/
async perlProperty(){
let c = this.stream.pop();
let invert = false;
if (c === 'P')
invert = true;
else if (c !== 'p')
this.stream.error("p or P must follow this backslash for a Perl-style property");
c = this.pop_nonws();
if (c !== '{')
this.stream.error("{ to enclose Perl-style property");
return await this.property(invert, '}');
}
/** Extract generic node for Perl/Posix property
* @param {boolean} invert starting inverted state
* @param {string} stop_char single character signaling end of property name-value definition
* @returns {Promise<Node>}
*/
async property(invert, stop_char){
const property_loader = properties.load();
/** Builder for property name */
let name = "";
/** Builder for property value */
let value = "";
/** Property name finalized via equality? */
let name_done = false;
// property matching discards whitespace, so okay to exclude here
let c = await this.pop_nonws_quoted();
if (!c.quoted && c.char === '^'){
invert = !invert;
c = await this.pop_nonws_quoted();
}
while (true){
append: {
if (!c.quoted){
if (!name_done){
const equal = c.char === '=';
if (equal || c.char === '≠'){
// not-equal inverts
if (!equal)
invert = !invert;
name_done = true;
if (!name)
this.stream.error("property name");
break append;
}
}
// end property
if (c.char === stop_char){
// if equality was specified, must have non-empty value
if (name_done && !value)
this.stream.error("property value");
break;
}
if (c.char === null)
this.stream.error("remainder of property definition");
}
if (name_done)
value += c.char;
else name += c.char;
}
c = await this.pop_nonws_quoted();
}
// lookup property
await property_loader;
const group = await properties.get(name, value);
if (group.invert)
invert = !invert;
return {type:"property", op:"union", invert, name, value, group: group.group};
}
//// SEQUENCE (set ops / enumerated set)
/** A sequence is surrounded by brackets, and contains either an enumerated or composed set
* @returns {Promise<Node>}
*/
async sequence(){
/* whether this is a set op or enumerated set is unknown at the moment
setop = first chars are open bracket [ or \[pP] (perl property)
enuemrated = all else
*/
let enumerated = true;
let invert = false;
// at most loops twice when inverting
while (true){
const c = this.peek_nonws();
// first character can invert
if (!invert && (c === '^' || c === '-')){
invert = !invert;
this.stream.discard();
continue;
}
// lookahead for composed set
if (c === '[')
enumerated = false;
else if (c === '\\'){
const c2 = this.stream.peek(2);
if (c2 === '\\p' || c2 === '\\P')
enumerated = false;
}
break;
}
/** List of nodes in sequence
* @type {Node[]}
*/
const value = await (enumerated ? this.enumerated() : this.composed());
// empty set
if (!value.length)
return {type:"empty", op:"union", invert};
// single value in sequence?
if (value.length === 1){
const node = value[0];
if (invert)
node.invert = !node.invert;
return node;
}
// since operators have left-to-right precedence, we can merge w/ first if group
const first = value[0];
if (first.type === "group" && !first.invert){
const node = value.shift();
node.invert = invert;
node.value.push(...value);
return node;
}
// otherwise make it a group
return {type:"group", op:"union", invert, value};
}
/** A composition of set operations combining various other sets
* @returns {Promise<Node>}
*/
async composed(){
const value = [];
let op = "union";
loop: while (true){
// set operand
const node = await this.root();
node.op = op;
value.push(node);
// set operator
switch (this.peek_nonws()){
case '+': op = "union"; break;
case '&': op = "intersect"; break;
case '-': op = "difference"; break;
case '~': op = "symmetricDifference"; break;
case '[':
case '\\':
op = "union";
continue loop;
case ']':
this.stream.discard();
break loop;
case null:
this.stream.error("] to end composed set")
default:
this.stream.error("set operator, set operand, or ] to end composed set");
}
this.stream.discard();
}
return value;
}
/** An enumeration of characters and strings
* @returns {Promise<Node>}
*/
async enumerated(){
/** Accumulated values; map from node type to values list
* @type {object<string,list>}
*/
let values = {};
/** Range expected? */
let range = false;
/** Potential start of range, null if no range expected
* @type {?{char: string, length: number, is_char: boolean}}
*/
let start = null;
/** Add a value of a certain type to `values` */
function push(type, ...items){
const arr = values[type];
if (!arr)
values[type] = items;
else arr.push(...items);
}
/** Flush start as singleton node */
const flush = (cur) => {
if (range){
// character range mismatch already detected
if (cur.is_char !== start.is_char)
this.stream.error("end of string range");
// validate length
if (cur.is_char){
if (start.length !== 1 || cur.length !== 1)
this.stream.error("range start and end to both be exactly one character");
// validate codepoints are ascending
if (start.char.codePointAt(0) > cur.char.codePointAt(0))
this.stream.error("character range start to be <= end");
}
else{
if (start.length < cur.length)
this.stream.error("string range start length >= end length");
// break into codepoints
const scp = Array.from(start.char);
const ecp = Array.from(cur.char);
// extract shared prefix
const prefix_length = start.length-cur.length;
if (prefix_length)
cur.char = scp.splice(0, prefix_length).join("")+cur.char;
// validate codepoints are ascending
for (let i=0; i<scp.length; i++){
const sin = scp[i].codePointAt(0);
const ein = ecp[i].codePointAt(0);
if (sin > ein)
this.stream.error(
`string range start character '${scp[i]}' `+
`to be <= end character '${ecp[i]}'`
);
}
}
// if equal start/end, don't make it a range
if (start.char !== cur.char){
push(start.is_char ? "charRange" : "stringRange", [start.char, cur.char]);
start = null;
range = false;
return;
}
}
// not a range
if (start && start.length){
const t = start.is_char ? "char" : "string";
// split out multiple characters into list
if (!start.is_char || start.length === 1)
push(t, start.char)
else push(t, ...start.char);
}
// reduction from range to singleton?
if (range){
range = false;
start = null;
}
// buffer current value in case its part of a range
else start = cur;
};
loop: while (true){
const c = await this.pop_nonws_quoted();
/** Current string or character token */
let cur = c;
cur.is_char = true;
if (!c.quoted){
switch (c.char){
case ']':
case '-':
case null:
if (range)
this.stream.error(`end of ${start.is_char ? 'character' : 'string'} range`);
if (c.char === ']'){
if (start)
flush(null);
break loop;
}
if (c.char === null)
this.stream.error("] to end enumerated set");
// otherwise, range separator
if (!start)
this.stream.error("starting character or string for this range");
range = true;
continue loop;
case '{':
if (range && start.is_char)
this.stream.error("end of character range");
cur = await this.string();
break;
}
}
flush(cur);
}
// build nodes
let value = [];
for (const k in values){
const v = values[k];
value.push({type:k, op:"union", invert:false, value:v});
}
return value;
}
/** Extract a string
* @returns {Promise<{char: string, is_char: false, length: number}>}
*/
async string(){
let value = "";
let length = 0;
while (true){
const c = await this.pop_nonws_quoted();
if (!c.quoted){
if (c.char === '}')
break;
if (c.char === null)
this.stream.error("} to end string");
}
value += c.char;
length += c.length;
}
return {char: value, length, is_char: false};
}
//// QUOTED CHARACTERS
/** Extract quoted character(s)
* @returns {Promise<{char: string, length: number}>}
*/
async quoted(){
let c = this.stream.pop();
switch (c){
case 'x': return this.quotedCodepoint(2, true);
case 'u': return this.quotedCodepoint(4, true);
case 'U': return this.quotedCodepoint(8, false);
case 'N': return this.quotedName();
case 'a': c = '\x07'; break;
case 'b': c = '\b'; break;
case 't': c = '\t'; break;
case 'n': c = '\n'; break;
case 'v': c = '\v'; break;
case 'f': c = '\f'; break;
case 'r': c = '\r'; break;
case null:
this.stream.error("quoted character");
}
// all others resolve to the same character
return {char: c, length: 1};
}
/** Fetch a hex codepoint, optionally a list of such codepoints
* @param {number} length how many hex chars are allowed for non lists
* @param {boolean} list whether a list of codepoints is allowed
* @returns {{char: string, length: number}}
*/
quotedCodepoint(length, list){
let c = this.stream.pop();
// single codepoint
if (is_hex(c)){
let digit = c;
while (--length){
c = this.stream.pop();
if (!is_hex(c))
this.stream.error(`${length} hex digits to complete quoted codepoint`);
digit += c;
}
digit = parseInt(digit, 16);
if (digit > 0x10FFFF)
this.stream.error(`codepoint to be <= 10FFFF`);
return {char: String.fromCodePoint(digit), length: 1};
}
else if (!list)
this.stream.error(`${length} hex digits for this quoted codepoint`);
// list of codepoints
if (is_ws(c))
c = this.pop_nonws();
if (c !== '{')
this.stream.error(`${length} hex digits or a hex list for this quoted codepoint`);
c = this.pop_nonws();
/** List of codepoints we've extracted */
list = [];
/** Current digit we're building */
let digit = "";
while (true){
// build codepoint
while (is_hex(c)){
digit += c;
c = this.stream.pop();
}
// emit codepoint (possibly none)
if (digit.length){
digit = parseInt(digit, 16);
if (digit > 0x10FFFF)
this.stream.error(`codepoint to be <= 10FFFF`);
list.push(digit);
digit = "";
}
// end of list?
if (c === '}')
break;
// whitespace separates list items
if (is_ws(c)){
c = this.pop_nonws();
continue;
}
// else bad character
this.stream.error(`only whitespace and hex digits inside quoted list of codepoints`);
}
return {char: String.fromCodePoint(...list), length: list.length};
}
/** Fetch name via unicode character name
* @returns {Promise<{char: string, length: number}>}
*/
async quotedName(){
let name = "";
let c = this.pop_nonws();
if (c !== '{')
this.stream.error("{ to start quoted character name");
while (true){
const c = this.stream.pop();
if (c === '}')
break;
if (c === null)
this.stream.error("} to end quoted character name");
name += c;
}
// unicode lookup
await characters.load();
return {char: characters.get(name), length: 1};
}
//// STREAM HELPERS
/** Peek the first non-whitespace character
* @returns {?string}
*/
peek_nonws(){
while (true){
const c = this.stream.peek();
if (!is_ws(c))
return c;
this.stream.discard();
}
}
/** Pop the first non-whitespace character
* @returns {?string}
*/
pop_nonws(){
while (true){
const c = this.stream.pop();
if (!is_ws(c))
return c;
}
}
/** Pop non-whitespace, resolving quoted character(s) if needed
* @returns {Promise<{char: string, quoted: boolean, length: number}>}
*/
async pop_nonws_quoted(){
let c = this.pop_nonws();
if (c === '\\'){
c = await this.quoted();
c.quoted = true;
return c;
}
return {char: c, length: 1, quoted: false};
}
}
const parser = new Parser();
/** Parse a UnicodeSet pattern into an abstract syntax tree (AST). Some trivial transformations are
* performed on the tree to reduce groups, combine invert/complement, and combine nodes. The parsing
* is async, as it lazily loads unicode data as it becomes needed.
* @name UnicodeSet.parse
* @function
* @param {string} pattern UnicodeSet pattern
* @returns {Promise<Node>} the AST root node
*/
function parse(pattern){
return parser.parse(pattern);
}
export { CharacterStream, Parser, parse };