types_helpers_string_range.mjs

/** Helpers to facilitate *string range* usage with a {@link RangeGroup}. A string range defines a
 * range of strings, where each unicode codepoint can be changed individually. For example, the
 * range `["ab", "cd"]` encompasses the strings: `ab ac ad bb bc bd cb cc cd`. You can check out the
 * concept of a string range as [described in the unicode
 * specification](https://www.unicode.org/reports/tr35/tr35.html#String_Range)
 * 
 * String ranges are actually multidimensional, where each character index is another dimension.
 * {@link RangeGroup} cannot work with multidimensional ranges directly. However, we can still use
 * string ranges if we enumerate the range values in all but the last dimension. This unrolls the
 * range into many one dimensional ones. The example above would then become: `["ab","ad"]
 * ["bb","bd"] ["cb","cd"]`.
 * 
 * While for some cases this won't be acceptable, I believe it accomodates a majority of use cases
 * for string ranges. For example, a common case that will still work is encoding a range of unicode
 * grapheme clusters; e.g. emojis, like `["👦🏻", "👦🏿"]`.
 * 
 * The {@link StringRange.toRanges1d} method performs the transformation. You can use the
 * {@link StringRange.size} to check the number of strings or 1d ranges a string range has.
 * 
 * For the 1d string ranges that are output, both {@link UnicodeType} and
 * {@link UnicodeNormType} are available for use as the {@link RangeGroup#type}.
 * 
 * @namespace StringRange
 */
import { pairwiseIterate } from "./unicode.mjs";

/** Calculate the number of strings encompassed by the string range, or the number of 1d ranges
 * that would be generated by {@link StringRange.toRanges1d}
 * @memberof StringRange
 * @param {Range} range a string range
 * @param {boolean} [strings=false] if true, outputs the number of strings in the range; if false,
 *  the number of 1d ranges that would be generated by {@link StringRange.toRanges1d}
 * @returns {number}
 */
function size(range, strings=false){
	// 1d ranges; we just exclude the last dimension
	let size = 1, last;
	for (const [sn, en] of pairwiseIterate(range.start, range.end)){
		last = size;
		size *= en-sn+1;
	}
	// for 1d ranges, if singleton in last dimension, exclusive bounds eliminates a 1d range
	const exclude = strings || size === last && (range.startExcl || range.endExcl);
	if (!strings)
		size = last;
	if (exclude){
		if (range.startExcl) size--;
		if (range.endExcl && size) size--;	
	}
	return size;
}

/** Converts string ranges into a set of {@link Range} objects that will work with
 * {@link RangeGroup}. The {@link Range#start} and {@link Range#end} must have equal length (see
 * {@link UnicodeHelpers.length}). Additionally, all the codepoints of {@link Range#start} must be
 * less than or equal the codepoints of {@link Range#end}. Exclusive range bounds apply to the first
 * and last generated strings, not individual characters.
 * @memberof StringRange
 * @param {Range} range a string range
 * @yields {Range} enumerated 1d ranges
 */
function* toRanges1d(range){
	/* For multi-codepoint strings, we would iterate them like a nested for loop, where each
		codepoint is another loop. E.g. the range [ac, cd] would iterate a-to-c in outer loop,
		and c-to-d for inner loop;
	*/
	// starting/ending strings and exclusion
	const bounds = Array.from(pairwiseIterate(range.start, range.end));
	// current string
	const cur = [];
	// state of each index in `cur`: -1=no iteration needed, 0=iterating, 1=reached end
	const cur_state = [];
	// how many values in cur_state still need to reach end (e.g. state == 0)
	let still_pending = 0;

	const last_dim = bounds.length-1;
	const last_bounds = bounds[last_dim];
	// range we're emitting from build_range
	let emit = null;
	/** Build range from current state, where last dimension defines iteration
	 * @returns {boolean} true if a range could be built (stored in `emit`)
	 */
	function build_range(first=false, last=false){
		// last dimension defines the range
		cur[last_dim] = last_bounds[0];
		const start = String.fromCodePoint(...cur);
		cur[last_dim] = last_bounds[1];
		const end = String.fromCodePoint(...cur);
		// calculate exclusion
		const startExcl = first && range.startExcl;
		const endExcl = last && range.endExcl;
		// empty range?
		if ((startExcl || endExcl) && last_bounds[0] === last_bounds[1])
			return false;
		emit = {start, end};
		if (startExcl)
			emit.startExcl = true;
		if (endExcl)
			emit.endExcl = true;
		return true;
	}

	// initialize current state;
	// exclude last dimension
	for (let i=0; i<last_dim; i++){
		const b = bounds[i];
		const s = b[0];
		const e = b[1];
		const single = s === e;
		still_pending += single^1;
		cur_state.push(-single);
		cur.push(s);
	}

	if (build_range(true, !still_pending))
		yield emit;
	// only last dimension changes
	if (!still_pending)
		return;
	
	while (true){
		// exclude last dimension
		let dim = cur.length-2;
		while (true){
			let state = cur_state[dim];
			// next value
			if (!state){
				if (++cur[dim] === bounds[dim][1]){
					cur_state[dim] = 1;
					// last string?
					if (!--still_pending){
						if (build_range(false, true))
							yield emit;
						return;
					}
				}
				if (build_range())
					yield emit;
				break;
			}
			// reset dimension
			if (state > 0){
				cur_state[dim] = 0;
				cur[dim] = bounds[dim][0];
				still_pending++;
			}
			// ignore dims where state < 0
			// with still_pending counter, dim will never be zero
			dim--;
		}
	}
}

export {size, toRanges1d};