types_helpers_unicode.mjs

/** Helpers for implementing methods in {@link StringRange} and {@link UnicodeType}. They are
 * provided publicly in case they can be useful for building other custom string-based types.
 * @namespace UnicodeHelpers
 */

/** Count number of unicode codepoints in a string, e.g. the UTF-32 encoded string length
 * @memberof UnicodeHelpers
 * @param {string} str the string to get the length of
 * @returns {number} the string length
 */
function length(str){
	let l = 0;
	for (const _ of str)
		l++;
	return l;
}
/** Get the UTF-16 string length for a given codepoint
 * @memberof UnicodeHelpers
 * @param {number} codepoint the unicode codepoint
 * @returns {number} two if its a surrogate pair, else one
 */
function utf16Length(codepoint){
	return 1 + (codepoint >= 0x10000);
}
/** Compare the length of two strings, using the count of unicode codepoints
 * @memberof UnicodeHelpers
 * @param {string} a reference string
 * @param {string} b string to compare with
 * @returns {number} negative if `a` has less; 0 if equal; positive if `a` has more
 */
function compareLength(a, b){
	const a_max = a.length;
	const b_max = b.length;
	// optimization for single characters
	if (a_max === b_max && a_max === 1)
		return 0;
	// with surrogate pairs, the number of codepoints is between [ceil(v.length/2), v.length]
	const b_min = (b_max+1) >> 1;
	// -1: a_max < b_min
	if (a_max < b_min)
		return -1;
	const a_min = (a_max+1) >> 1;
	// 1: a_min > b_max
	if (a_min > b_max)
		return 1;
	// can't determine, compute lengths
	return length(a) - length(b);
}
/** Pairwise iteration of the codepoints of two equal length strings
 * @memberof UnicodeHelpers
 * @param {string} a first string
 * @param {string} b second string
 * @yields {number[]} tuple of codepoints `[a, b]`
 */
function* pairwiseIterate(a, b){
	const a_len = a.length;
	let ai = 0, bi = 0;
	while (true){
		const an = a.codePointAt(ai);
		const bn = b.codePointAt(bi);
		yield [an, bn];
		ai += 1 + (an >= 0x10000);
		if (ai >= a_len)
			return;
		bi += 1 + (bn >= 0x10000);
	}
}
/** Get the last codepoint of a string
 * @memberof UnicodeHelpers
 * @param {string} str non-empty string to fetch the codepoint of
 * @returns {number} the codepoint
 */
function lastCodepoint(str){
	// UTF-16 (https://en.wikipedia.org/wiki/UTF-16) is self-synchronizing, so don't need to iterate full string;
	// check if surrogate pair first
	if (str.length > 1){
		const c = str.codePointAt(str.length-2);
		if (c >= 0x10000)
			return c;
	}
	// not a surrogate
	return str.codePointAt(str.length-1);
}

export {length, utf16Length, compareLength, pairwiseIterate, lastCodepoint};