// unidata_properties.mjs

/** Holds data for unicode property names and values. Many of the names, aliases, and valid set
 * of values are extracted from the raw unicode source. The actual codepoint data comes from
 * the `node-unicode` library.
 * @namespace properties
 */
import * as unidata from "./unidata.mjs";
import { RangeGroup, UnicodeNormType } from "range-group";
import unicode_index from "@unicode/unicode-15.0.0";

/** Property name/value index structure
 * @typedef {object} PropertyIndex
 * @prop {object<string, string>} shorthand Map of property name to its shorthand alias
 * @prop {object<string, object<string,string>>} values For each shorthand property name, a map of
 * 	property value to its shorthand alias
 * @prop {object<string, object<string,Codepoints>>} codepoints For each property name a map of
 * 	property value to {@link Codepoints} data
 * @memberof properties
 */

/** Whether {@link load} has finished populating {@link names}. It is set to `true` at the very
 * end of `load`, and {@link get} throws while it is still `false`.
 * @type {boolean}
 * @private
 */
let loaded = false;
/** Raw index of property names and values. Starts out empty; every sub-map is filled in
 * by {@link load}.
 * @type {PropertyIndex}
 * @memberof properties
 */
const names = {
	// property name or alias -> shorthand property name
	shorthand: {},
	// shorthand property name -> (property value or alias -> shorthand property value)
	values: {},
	// property name -> (property value -> lazy Codepoints loader)
	codepoints: {}
};

/** Handles lazy loading a set of codepoints from the `node-unicode` library as a `RangeGroup`.
 * The range data is imported on the first {@link Codepoints#get} call and cached afterwards.
 */
class Codepoints{
	/** In-flight load shared by overlapping {@link Codepoints#get} calls; `null` when idle
	 * @type {?Promise<RangeGroup>}
	 */
	#pending = null;
	/** Create a new lazy loader
	 * @param {string} path pathname inside `node-unicode` library, e.g. `"General_Category/Number"`
	 */
	constructor(path){
		this.path = path;
		/** Cached range group; stays `null` until a load has completed successfully
		 * @type {?RangeGroup}
		 */
		this.group = null;
	}
	/** Get range group for these codepoints. Overlapping calls share a single import, so they
	 * all resolve to the same `RangeGroup` instance. A failed load is not cached; the next call
	 * simply tries again.
	 * @returns {Promise<RangeGroup>}
	 */
	async get(){
		if (this.group !== null)
			return this.group;
		// lazy load; only the first caller kicks off the import, later callers join it
		if (this.#pending === null){
			this.#pending = this.#load().finally(() => {
				this.#pending = null;
			});
		}
		return this.#pending;
	}
	/** Import the raw ranges and convert them to a normalized `RangeGroup`
	 * @returns {Promise<RangeGroup>}
	 */
	async #load(){
		const ranges = (await import(`@unicode/unicode-15.0.0/${this.path}/ranges.js`)).default;
		const group = new RangeGroup(ranges.map(({begin, end}) => ({
			start: String.fromCodePoint(begin),
			// node-unicode uses exclusive end, so step back one codepoint for our end
			end: String.fromCodePoint(end-1)
		})), {type:UnicodeNormType});
		group.selfUnion();
		// publish only once fully built, so a failure never leaves a half-prepared group cached
		this.group = group;
		return group;
	}
}

/** In-flight promise shared by overlapping {@link load} calls; `null` when no load is running
 * @type {?Promise<void>}
 * @private
 */
let load_pending = null;

/** Lazily load the index of property names and values. The individual unicode codepoints for
 * each are lazily loaded as they are needed. Calling this again after it has completed is a
 * no-op, and calls that overlap a running load wait for that same load instead of starting a
 * second one (a second pass would reset the value maps the first pass is still filling).
 * @returns {Promise<void>}
 * @memberof properties
 */
async function load(){
	// already loaded?
	if (loaded)
		return;
	if (load_pending === null){
		load_pending = load_index().finally(() => {
			load_pending = null;
		});
	}
	return load_pending;
}

/** Does the actual work for {@link load}: reads the alias files into `names.shorthand` and
 * `names.values`, then matches every entry of the `@unicode` index against them to fill
 * `names.codepoints`. Sets `loaded` once everything is in place.
 * @returns {Promise<void>}
 * @private
 */
async function load_index(){
	// each row lists the aliases of one property; the first entry is used as the shorthand
	for await (const aliases of unidata.reader("PropertyAliases.txt")){
		const short = aliases[0];
		names.values[short] = {};
		for (const alias of aliases)
			names.shorthand[alias] = short;
	}
	// Note, a bunch of these actually have @missing annotations, which is problematic; seems manual
	// coding is the only way to handle those, perhaps requiring additional supplemental unidata
	// files; I'll add manual logic to defer to the @unicode library where appropriate
	// each row is: property shorthand, value shorthand, then any further aliases of that value
	for await (const value of unidata.reader("PropertyValueAliases.txt")){
		const prop_values = names.values[value[0]];
		const short = value[1];
		for (let i=1; i<value.length; i++)
			prop_values[value[i]] = short;
	}
	// Automatically map property names/values to those in @unicode lib;
	// paths we could not match are collected here, which is handy to log when debugging
	const unknown = [];
	for (const l1 in unicode_index){
		// these are for character name aliases
		if (l1 === "Names")
			continue;
		const l2_lst = unicode_index[l1];
		// currently just empty for bidi mirroring glyph; we'll skip it
		if (!l2_lst.length)
			continue;
		const l1_norm = unidata.normalize(l1);
		// first is usually the property name, but could be a category of properties instead
		const l1_name = l1_norm in names.shorthand;
		if (l1_name){
			// key by shorthand, same as the entries added below and the lookup done in get()
			const l1_short = names.shorthand[l1_norm];
			if (names.codepoints[l1_short] === undefined)
				names.codepoints[l1_short] = {};
		}
		// property value or binary property
		for (const l2 of l2_lst){
			const l2_norm = unidata.normalize(l2);
			const path = l1+"/"+l2;
			let prop, val;
			// property value
			if (l1_name){
				prop = l1_norm;
				val = l2_norm;
			}
			// binary property
			else{
				if (!(l2_norm in names.shorthand)){
					// allow all from @unicode lib, except emojitest;
					if (l2_norm === "emojitest"){
						unknown.push(path);
						continue;
					}
					names.shorthand[l2_norm] = l2_norm;
					// copy aliases from a known binary property
					names.values[l2_norm] = Object.assign({}, names.values.alpha);
				}
				prop = l2_norm;
				val = 'y';
			}
			prop = names.shorthand[prop];
			let sval = names.values[prop][val];
			if (sval === undefined){
				// use @unicode's script extensions / case folding; no alias data for these,
				// so the value acts as its own shorthand; anything else is an unknown value
				if (prop !== "scx" && prop !== "cf"){
					unknown.push(path);
					continue;
				}
				names.values[prop][val] = val;
				sval = val;
			}
			// good match; add codepoint lookup structure
			let cvals = names.codepoints[prop];
			if (cvals === undefined)
				cvals = names.codepoints[prop] = {};
			cvals[sval] = new Codepoints(path);
		}
	}
	loaded = true;
}

/** Unicode characters associated with a property name-value pair
 * @typedef {object} PropertyCharacters
 * @prop {Codepoints} group container for characters
 * @prop {boolean} invert whether the group should be inverted (take the complement) to
 * 	meet the property name-value criteria; this can be set for binary properties
 * @memberof properties
 */

/** Look up `key` among the own properties of `map`, so names inherited from `Object.prototype`
 * (e.g. `constructor`) are never mistaken for known aliases
 * @param {?object} map alias map to search; a missing map is treated as empty
 * @param {string} key normalized name to look up
 * @returns {*} the mapped value, or `undefined` if `key` is not an own property of `map`
 * @private
 */
function lookup_own(map, key){
	if (map == null || !Object.prototype.hasOwnProperty.call(map, key))
		return undefined;
	return map[key];
}

/** Fetch character data given by the unicode property name-value pair. This will throw an error
 * if the name-value pair is unknown.
 * @param {string} name property name, or if `value` is omitted, it can be the property value
 * 	for General Category or Script properties
 * @param {?string} value property value; this can be blank, if `name` is a property value for
 *  General Category or Script properties, as mentioned above; otherwise, it defaults to "true",
 *  which will be a valid value for binary properties, but error for others
 * @returns {Promise<PropertyCharacters>}
 * @throws {Error} if `load` has not completed, if the name or value is unknown, or if no
 *  codepoint data is registered for the resolved name-value pair
 * @memberof properties
 */
async function get(name, value){
	if (!loaded)
		throw new Error("Must call load first to load the property name/values data");
	let name_norm = unidata.normalize(name);
	let value_norm;
	// on leaving this block, name_norm/value_norm hold the shorthand property name and value
	validate: {
		// verify its a known name-value pair
		if (value){
			value_norm = unidata.normalize(value);
			name_norm = lookup_own(names.shorthand, name_norm);
			if (!name_norm)
				throw new Error("Unknown/invalid unicode property name: "+name);
			value_norm = lookup_own(names.values[name_norm], value_norm);
			if (!value_norm)
				throw new Error(`Unknown/invalid value for unicode property '${name}': ${value}`);
			break validate;
		}
		// resolve the missing value
		// from whitespace example given in spec, we always check for a binary category first
		const name_short = lookup_own(names.shorthand, name_norm);
		if (name_short){
			name_norm = name_short;
			value = "true";
			// verify its a binary property
			value_norm = lookup_own(names.values[name_norm], value);
			if (!value_norm)
				throw new Error(`Property name '${name}' is not a binary property, so its property value is required`);
			break validate;
		}
		// if not a valid category, we allow a value for General_Category or Script properties
		for (const category of ["gc","sc"]){
			const value_short = lookup_own(names.values[category], name_norm);
			// found a match in this category!
			if (value_short){
				value = name;
				value_norm = value_short;
				name_norm = category;
				break validate;
			}
		}
		throw new Error("Unknown/invalid unicode binary property name, or GeneralCategory/Script property value: "+name);
	}
	// binary properties can be inverted; only they carry a 'true' alias, which keeps a
	// non-binary value that happens to shorten to 'n' from being inverted
	let invert = false;
	if (value_norm === 'n' && lookup_own(names.values[name_norm], 'true') !== undefined){
		invert = true;
		value_norm = 'y';
	}
	// name/value_norm are validated; check if we have a group for it
	const group = names.codepoints[name_norm]?.[value_norm];
	if (!group)
		throw new Error(`No RangeGroup available for ${name}=${value}`);
	return {group, invert};
}

export { names, Codepoints, load, get };