parse.mjs

/** Parsing a UnicodeSet into an AST. So an earlier verison of this code I was using the node `ebnf`
 * library to do interpreted parsing. That was working well, but when I was about to publish I found
 * out it didn't handle unicode surrogate pairs. I tried ANTLR4 next, but found the need for a
 * separate lexer/parser unsuitable for the heavy context-sensitive tokens of UnicodeSets. I looked
 * into peg/peggy, but they did not have unicode support yet. Seemed simpler just to handwrite the
 * parser in the end.
 */
import * as characters from "./unidata/characters.mjs";
import * as properties from "./unidata/properties.mjs";

/** Handles reading UTF-32 characters from JavaScript string, with lookahead ability */
class CharacterStream{
	/** Create a new streamer object
	 * @param {string} pattern the pattern string to iterate over
	 */
	constructor(pattern){
		/** Pattern we're iterating over for this stream */
		this.pattern = pattern;
		/** Current line number, where lines are given by '\n' character */
		this.line = 0;
		/** Current column number in UTF-16 encoding */
		this.col_utf16 = 0;
		/** Current column number, in UTF-32 encoding */
		this.col_utf32 = 0;
		/** Buffered characters from the stream */
		this.buffer = [];
		/** Character stream generator */
		this.stream = this.#streamer(pattern);
	}
	/** Throw a syntax error at this point in the stream
	 * @param {string} expected message indicating what was expected; prefix/suffix is generated
	 * @param {number} [discard=0] how many buffered characters were okay; the next character
	 * 	(possibly buffered) after those discarded is the one that triggered the error
	 */
	error(expected, discard=0){
		if (discard)
			this.discard(discard);
		const err = new SyntaxError(`Expected ${expected} at line #${this.line}, char #${this.col_utf32}`);
		err.line = this.line;
		err.col_utf16 = this.col_utf16;
		err.col_utf32 = this.col_utf32;
		err.valid = this.pattern.substring(0, err.col_utf16);
		err.invalid = this.pattern.substring(err.col_utf16);
		throw err;
	}
	/** Create the character stream generator */
	*#streamer(pattern){
		// will properly handle surrogates
		for (const char of pattern){
			// track line/chars for useful error messages
			this.col_utf16 += char.length;
			this.col_utf32++;
			if (char === '\n')
				this.line++;
			yield char;
		}
	}
	/** Ensure at least n>0 characters are buffered if possible
	 * @param {number} [n=1] number of characters to buffer
	 * @returns {boolean} true if we could fulfill buffer request
	 */
	buffer(n=1){
		let needed = n-this.buffer.length;
		while (needed > 0){
			const v = this.stream.next();
			// not enough characters
			if (v.done)
				return false;
			this.buffer.push(v.value);
			needed--;
		}
		return true;
	}
	/** Discard n>0 characters; throws if there are not n characters
	 * @param {number} [n=1] number of characters to discard
	 * @returns {boolean} true if we could fulfill discard request
	 */
	discard(n=1){
		if (this.buffer.length > n)
			this.buffer.splice(0, n);
		else{
			n -= this.buffer.length;
			// discard additional non-buffered?
			while (n){
				if (this.stream.next().done)
					return false;
				n--;
			}
			this.buffer.length = 0;
		}
		return true;
	}
	/** Fetch n>0 characters as a string
	 * @param {number} [n=1] number of characters to peek
	 * @returns {?string} the peeked string; null if we can't get n characters
	 */
	peek(n=1){
		const l = this.buffer.length;
		let str;
		if (l){
			if (n === 1)
				return this.buffer[0];
			if (l > n){
				str = "";
				for (let i=0; i<n; i++)
					str += this.buffer[i];
				return str;
			}
			str = this.buffer.join("");
			n -= l;
		}
		else str = "";
		// add excess non-buffered?
		while (n){
			const v = this.stream.next();
			// not enough characters
			if (v.done)
				return null;
			str += v.value;
			this.buffer.push(v.value);
			n--;
		}
		return str;
	}
	/** Fetch n>0 characters and discard them
	 * @param {number} [n=1] number of characters to pop
	 * @returns {?string} the popped string; null if we can't get n characters
	 */
	pop(n=1){
		const l = this.buffer.length;
		if (l > n)
			return this.buffer.splice(0, n).join("");
		let str;
		if (l){
			str = l === 1 ? this.buffer[0] : this.buffer.join("");
			n -= l;
			this.buffer.length = 0;
		}
		else str = "";
		// more non-buffered chars needed?
		while (n){
			const v = this.stream.next();
			// can't fulfill request?
			if (v.done)
				return null;
			str += v.value;
			n--;
		}
		return str;
	}
}

/** AST node generated by the parser
 * @typedef {object} Node
 * @prop {string} type The node type; can be one of the following:
 * - `empty`: an empty set
 * - `group`: a group of 2+ sets to be combined with set operations
 * - `property`: a set defined by unicode property name/value
 * - `char`: single characters
 * - `charRange`: character ranges
 * - `string`: single strings
 * - `stringRange`: string ranges
 * @prop {string} op A set operation indicating how it should be combined with the
 *  previous set in a group. One of: `union`, `difference`, `intersect`, or `symmetricDifference`
 * @prop {boolean} invert Whether the set is inverted (e.g. the unary complement set operation)
 * @prop {?string} name Only present for `property` type; it is the extracted property name
 * @prop {?any} value Absent for `empty` type. Otherwise:
 * - `group`: a list of nested {@link Node}
 * - `property`: the extracted property value string
 * - `char` and `string`: a list of strings
 * - `charRange` and `stringRange`: a list of string tuples, giving range `[start, end]`
 * @prop {?Codepoints} group Only present for `property` type; it represents the set of codepoints
 * 	for that unicode property name-value pair
 */

// Not using regex here since its finicky with surrogate pairs
function is_ws(c){
	// pattern whitespace
	return c?.length === 1 && (
		(c >= '\x09' && c <= '\x0D') ||
		"\x20\x85\u200E\u200F\u2028\u2029".indexOf(c) !== -1);
}
function is_hex(c){
	return c?.length === 1 && (
		(c >= 'a' && c <= 'f') ||
		(c >= 'A' && c <= 'F') ||
		(c >= '0' && c <= '9'));
}

/** Handwritten parser to parse UnicodeSet patterns as an abstract syntax tree.
 * Friendly error messages throughout
 */
class Parser{
	/** Parse a UnicodeSet pattern into an abstract syntax tree (AST)
	 * @param {string} pattern the pattern to parse
	 * @returns {Promise<Node>}
	 */
	async parse(pattern){
		this.stream = new CharacterStream(pattern);
		const ast = await this.root();
		// ensure EOF
		const c = this.pop_nonws();
		if (c !== null)
			this.stream.error("end-of-string");
		return ast;				
	}
	/** Root set, enclosing all others
	 * @returns {Promise<Node>}
	 */
	root(){
		let c = this.pop_nonws();
		if (c === '['){
			c = this.stream.peek();
			if (c === ':'){
				this.stream.discard();
				return this.posixProperty();
			}
			return this.sequence();
		}
		else if (c === '\\')
			return this.perlProperty();
		this.stream.error("[ denoting set start, or a Perl-style property");
	}

	//// PROPERTIES

	/** Extract node for Posix-style property
	 * @returns {Promise<Node>}
	 */
	async posixProperty(){
		const node = await this.property(false, ':');
		const c = this.stream.pop();
		if (c !== ']')
			this.stream.error("] to end the property");
		return node;
	}
	/** Extract node for Perl-style property
	 * @returns {Promise<Node>}
	 */
	async perlProperty(){
		let c = this.stream.pop();
		let invert = false;
		if (c === 'P')
			invert = true;
		else if (c !== 'p')
			this.stream.error("p or P must follow this backslash for a Perl-style property");
		c = this.pop_nonws();
		if (c !== '{')
			this.stream.error("{ to enclose Perl-style property");
		return await this.property(invert, '}');
	}
	/** Extract generic node for Perl/Posix property
	 * @param {boolean} invert starting inverted state
	 * @param {string} stop_char single character signaling end of property name-value definition
	 * @returns {Promise<Node>}
	 */
	async property(invert, stop_char){
		const property_loader = properties.load();
		/** Builder for property name */
		let name = "";
		/** Builder for property value */
		let value = "";
		/** Property name finalized via equality? */
		let name_done = false;

		// property matching discards whitespace, so okay to exclude here
		let c = await this.pop_nonws_quoted();
		if (!c.quoted && c.char === '^'){
			invert = !invert;
			c = await this.pop_nonws_quoted();
		}
		while (true){
			append: {
				if (!c.quoted){
					if (!name_done){
						const equal = c.char === '=';
						if (equal || c.char === '≠'){
							// not-equal inverts
							if (!equal)
								invert = !invert;
							name_done = true;
							if (!name)
								this.stream.error("property name");
							break append;
						}
					}
					// end property
					if (c.char === stop_char){
						// if equality was specified, must have non-empty value
						if (name_done && !value)
							this.stream.error("property value");
						break;
					}
					if (c.char === null)
						this.stream.error("remainder of property definition");
				}
				if (name_done)
					value += c.char;
				else name += c.char;
			}
			c = await this.pop_nonws_quoted();
		}

		// lookup property
		await property_loader;
		const group = await properties.get(name, value);
		if (group.invert)
			invert = !invert;
		return {type:"property", op:"union", invert, name, value, group: group.group};
	}

	//// SEQUENCE (set ops / enumerated set)

	/** A sequence is surrounded by brackets, and contains either an enumerated or composed set
	 * @returns {Promise<Node>}
	 */
	async sequence(){
		/* whether this is a set op or enumerated set is unknown at the moment
			setop = first chars are open bracket [ or \[pP] (perl property)
			enuemrated = all else
		*/
		let enumerated = true;
		let invert = false;
		// at most loops twice when inverting
		while (true){
			const c = this.peek_nonws();			
			// first character can invert
			if (!invert && (c === '^' || c === '-')){
				invert = !invert;
				this.stream.discard();
				continue;
			}
			// lookahead for composed set
			if (c === '[')
				enumerated = false;
			else if (c === '\\'){
				const c2 = this.stream.peek(2);
				if (c2 === '\\p' || c2 === '\\P')
					enumerated = false;
			}
			break;
		}
		/** List of nodes in sequence
		 * @type {Node[]}
		 */
		const value = await (enumerated ? this.enumerated() : this.composed());
		// empty set
		if (!value.length)
			return {type:"empty", op:"union", invert};
		// single value in sequence?
		if (value.length === 1){
			const node = value[0];
			if (invert)
				node.invert = !node.invert;
			return node;
		}
		// since operators have left-to-right precedence, we can merge w/ first if group
		const first = value[0];
		if (first.type === "group" && !first.invert){
			const node = value.shift();
			node.invert = invert;
			node.value.push(...value);
			return node;
		}			
		// otherwise make it a group
		return {type:"group", op:"union", invert, value};
	}
	/** A composition of set operations combining various other sets
	 * @returns {Promise<Node>}
	 */
	async composed(){
		const value = [];
		let op = "union";
		loop: while (true){
			// set operand
			const node = await this.root();
			node.op = op;
			value.push(node);
			// set operator
			switch (this.peek_nonws()){
				case '+': op = "union"; break;
				case '&': op = "intersect"; break;
				case '-': op = "difference"; break;
				case '~': op = "symmetricDifference"; break;
				case '[':
				case '\\':
					op = "union";
					continue loop;
				case ']':
					this.stream.discard();
					break loop;
				case null:
					this.stream.error("] to end composed set")
				default:
					this.stream.error("set operator, set operand, or ] to end composed set");
			}
			this.stream.discard();
		}
		return value;
	}
	/** An enumeration of characters and strings
	 * @returns {Promise<Node>}
	 */
	async enumerated(){
		/** Accumulated values; map from node type to values list
		 * @type {object<string,list>}
		 */
		let values = {};
		/** Range expected? */
		let range = false;
		/** Potential start of range, null if no range expected
		 * @type {?{char: string, length: number, is_char: boolean}}
		 */
		let start = null;

		/** Add a value of a certain type to `values` */
		function push(type, ...items){
			const arr = values[type];
			if (!arr)
				values[type] = items;
			else arr.push(...items);
		}
		/** Flush start as singleton node */
		const flush = (cur) => {
			if (range){
				// character range mismatch already detected
				if (cur.is_char !== start.is_char)
					this.stream.error("end of string range");
				// validate length
				if (cur.is_char){
					if (start.length !== 1 || cur.length !== 1)
						this.stream.error("range start and end to both be exactly one character");
					// validate codepoints are ascending
					if (start.char.codePointAt(0) > cur.char.codePointAt(0))
						this.stream.error("character range start to be <= end");
				}
				else{
					if (start.length < cur.length)
						this.stream.error("string range start length >= end length");
					// break into codepoints
					const scp = Array.from(start.char);
					const ecp = Array.from(cur.char);
					// extract shared prefix
					const prefix_length = start.length-cur.length;
					if (prefix_length)
						cur.char = scp.splice(0, prefix_length).join("")+cur.char;
					// validate codepoints are ascending
					for (let i=0; i<scp.length; i++){
						const sin = scp[i].codePointAt(0);
						const ein = ecp[i].codePointAt(0);
						if (sin > ein)
							this.stream.error(
								`string range start character '${scp[i]}' `+
								`to be <= end character '${ecp[i]}'`
							);
					}
				}
				// if equal start/end, don't make it a range
				if (start.char !== cur.char){
					push(start.is_char ? "charRange" : "stringRange", [start.char, cur.char]);
					start = null;
					range = false;
					return;
				}
			}
			// not a range
			if (start && start.length){
				const t = start.is_char ? "char" : "string";
				// split out multiple characters into list
				if (!start.is_char || start.length === 1)
					push(t, start.char)
				else push(t, ...start.char);
			}
			// reduction from range to singleton?
			if (range){
				range = false;
				start = null;
			}
			// buffer current value in case its part of a range			
			else start = cur;
		};

		loop: while (true){
			const c = await this.pop_nonws_quoted();
			/** Current string or character token */
			let cur = c;
			cur.is_char = true;
			if (!c.quoted){
				switch (c.char){
					case ']':
					case '-':
					case null:
						if (range)
							this.stream.error(`end of ${start.is_char ? 'character' : 'string'} range`);
						if (c.char === ']'){
							if (start)
								flush(null);
							break loop;
						}
						if (c.char === null)
							this.stream.error("] to end enumerated set");
						// otherwise, range separator
						if (!start)
							this.stream.error("starting character or string for this range");
						range = true;
						continue loop;
					case '{':
						if (range && start.is_char)
							this.stream.error("end of character range");
						cur = await this.string();
						break;						
				}
			}
			flush(cur);
		}
		// build nodes
		let value = [];
		for (const k in values){
			const v = values[k];
			value.push({type:k, op:"union", invert:false, value:v});
		}
		return value;
	}
	/** Extract a string
	 * @returns {Promise<{char: string, is_char: false, length: number}>}
	 */
	async string(){
		let value = "";
		let length = 0;
		while (true){
			const c = await this.pop_nonws_quoted();
			if (!c.quoted){
				if (c.char === '}')
					break;
				if (c.char === null)
					this.stream.error("} to end string");
			}
			value += c.char;
			length += c.length;
		}
		return {char: value, length, is_char: false};
	}

	//// QUOTED CHARACTERS

	/** Extract quoted character(s)
	 * @returns {Promise<{char: string, length: number}>}
	 */
	async quoted(){
		let c = this.stream.pop();
		switch (c){
			case 'x': return this.quotedCodepoint(2, true);
			case 'u': return this.quotedCodepoint(4, true);
			case 'U': return this.quotedCodepoint(8, false);
			case 'N': return this.quotedName();
			case 'a': c = '\x07'; break;
			case 'b': c = '\b'; break;
			case 't': c = '\t'; break;
			case 'n': c = '\n'; break;
			case 'v': c = '\v'; break;
			case 'f': c = '\f'; break;
			case 'r': c = '\r'; break;
			case null:
				this.stream.error("quoted character");
		}
		// all others resolve to the same character
		return {char: c, length: 1};
	}
	/** Fetch a hex codepoint, optionally a list of such codepoints
	 * @param {number} length how many hex chars are allowed for non lists
	 * @param {boolean} list whether a list of codepoints is allowed
	 * @returns {{char: string, length: number}}
	 */
	quotedCodepoint(length, list){
		let c = this.stream.pop();

		// single codepoint
		if (is_hex(c)){
			let digit = c;
			while (--length){
				c = this.stream.pop();
				if (!is_hex(c))
					this.stream.error(`${length} hex digits to complete quoted codepoint`);
				digit += c;
			}
			digit = parseInt(digit, 16);
			if (digit > 0x10FFFF)
				this.stream.error(`codepoint to be <= 10FFFF`);
			return {char: String.fromCodePoint(digit), length: 1};
		}
		else if (!list)
			this.stream.error(`${length} hex digits for this quoted codepoint`);

		// list of codepoints
		if (is_ws(c))
			c = this.pop_nonws();
		if (c !== '{')
			this.stream.error(`${length} hex digits or a hex list for this quoted codepoint`);
		c = this.pop_nonws();
		/** List of codepoints we've extracted */
		list = [];
		/** Current digit we're building */
		let digit = "";
		while (true){
			// build codepoint
			while (is_hex(c)){
				digit += c;
				c = this.stream.pop();
			}
			// emit codepoint (possibly none)
			if (digit.length){
				digit = parseInt(digit, 16);
				if (digit > 0x10FFFF)
					this.stream.error(`codepoint to be <= 10FFFF`);
				list.push(digit);
				digit = "";
			}
			// end of list?
			if (c === '}')
				break;
			// whitespace separates list items
			if (is_ws(c)){
				c = this.pop_nonws();
				continue;
			}
			// else bad character
			this.stream.error(`only whitespace and hex digits inside quoted list of codepoints`);
		}
		return {char: String.fromCodePoint(...list), length: list.length};
	}
	/** Fetch name via unicode character name
	 * @returns {Promise<{char: string, length: number}>}
	 */
	async quotedName(){
		let name = "";
		let c = this.pop_nonws();
		if (c !== '{')
			this.stream.error("{ to start quoted character name");
		while (true){
			const c = this.stream.pop();
			if (c === '}')
				break;
			if (c === null)
				this.stream.error("} to end quoted character name");
			name += c;
		}
		// unicode lookup
		await characters.load();
		return {char: characters.get(name), length: 1};
	}
	
	//// STREAM HELPERS

	/** Peek the first non-whitespace character
	 * @returns {?string}
	 */
	peek_nonws(){
		while (true){
			const c = this.stream.peek();
			if (!is_ws(c))
				return c;
			this.stream.discard();
		}
	}
	/** Pop the first non-whitespace character
	 * @returns {?string}
	 */
	pop_nonws(){
		while (true){
			const c = this.stream.pop();
			if (!is_ws(c))
				return c;
		}
	}
	/** Pop non-whitespace, resolving quoted character(s) if needed
	 * @returns {Promise<{char: string, quoted: boolean, length: number}>}
	 */
	async pop_nonws_quoted(){
		let c = this.pop_nonws();
		if (c === '\\'){
			c = await this.quoted();
			c.quoted = true;
			return c;
		}
		return {char: c, length: 1, quoted: false};
	}
}

const parser = new Parser();
/** Parse a UnicodeSet pattern into an abstract syntax tree (AST). Some trivial transformations are
 * performed on the tree to reduce groups, combine invert/complement, and combine nodes. The parsing
 * is async, as it lazily loads unicode data as it becomes needed.
 * @name UnicodeSet.parse
 * @function
 * @param {string} pattern UnicodeSet pattern
 * @returns {Promise<Node>} the AST root node
 */
function parse(pattern){
	return parser.parse(pattern);
}

export { CharacterStream, Parser, parse };