types_helpers_string_range.mjs
/** Helpers to facilitate *string range* usage with a {@link RangeGroup}. A string range defines a
* range of strings, where each unicode codepoint can be changed individually. For example, the
* range `["ab", "cd"]` encompasses the strings: `ab ac ad bb bc bd cb cc cd`. You can check out the
* concept of a string range as [described in the unicode
* specification](https://www.unicode.org/reports/tr35/tr35.html#String_Range)
*
* String ranges are actually multidimensional, where each character index is another dimension.
* {@link RangeGroup} cannot work with multidimensional ranges directly. However, we can still use
* string ranges if we enumerate the range values in all but the last dimension. This unrolls the
* range into many one dimensional ones. The example above would then become: `["ab","ad"]
* ["bb","bd"] ["cb","cd"]`.
*
* While for some cases this won't be acceptable, I believe it accomodates a majority of use cases
* for string ranges. For example, a common case that will still work is encoding a range of unicode
* grapheme clusters; e.g. emojis, like `["👦🏻", "👦🏿"]`.
*
* The {@link StringRange.toRanges1d} method performs the transformation. You can use the
* {@link StringRange.size} to check the number of strings or 1d ranges a string range has.
*
* For the 1d string ranges that are output, both {@link UnicodeType} and
* {@link UnicodeNormType} are available for use as the {@link RangeGroup#type}.
*
* @namespace StringRange
*/
import { pairwiseIterate } from "./unicode.mjs";
/** Calculate the number of strings encompassed by the string range, or the number of 1d ranges
* that would be generated by {@link StringRange.toRanges1d}
* @memberof StringRange
* @param {Range} range a string range
* @param {boolean} [strings=false] if true, outputs the number of strings in the range; if false,
* the number of 1d ranges that would be generated by {@link StringRange.toRanges1d}
* @returns {number}
*/
function size(range, strings=false){
// 1d ranges; we just exclude the last dimension
let size = 1, last;
for (const [sn, en] of pairwiseIterate(range.start, range.end)){
last = size;
size *= en-sn+1;
}
// for 1d ranges, if singleton in last dimension, exclusive bounds eliminates a 1d range
const exclude = strings || size === last && (range.startExcl || range.endExcl);
if (!strings)
size = last;
if (exclude){
if (range.startExcl) size--;
if (range.endExcl && size) size--;
}
return size;
}
/** Converts string ranges into a set of {@link Range} objects that will work with
* {@link RangeGroup}. The {@link Range#start} and {@link Range#end} must have equal length (see
* {@link UnicodeHelpers.length}). Additionally, all the codepoints of {@link Range#start} must be
* less than or equal the codepoints of {@link Range#end}. Exclusive range bounds apply to the first
* and last generated strings, not individual characters.
* @memberof StringRange
* @param {Range} range a string range
* @yields {Range} enumerated 1d ranges
*/
function* toRanges1d(range){
/* For multi-codepoint strings, we would iterate them like a nested for loop, where each
codepoint is another loop. E.g. the range [ac, cd] would iterate a-to-c in outer loop,
and c-to-d for inner loop;
*/
// starting/ending strings and exclusion
const bounds = Array.from(pairwiseIterate(range.start, range.end));
// current string
const cur = [];
// state of each index in `cur`: -1=no iteration needed, 0=iterating, 1=reached end
const cur_state = [];
// how many values in cur_state still need to reach end (e.g. state == 0)
let still_pending = 0;
const last_dim = bounds.length-1;
const last_bounds = bounds[last_dim];
// range we're emitting from build_range
let emit = null;
/** Build range from current state, where last dimension defines iteration
* @returns {boolean} true if a range could be built (stored in `emit`)
*/
function build_range(first=false, last=false){
// last dimension defines the range
cur[last_dim] = last_bounds[0];
const start = String.fromCodePoint(...cur);
cur[last_dim] = last_bounds[1];
const end = String.fromCodePoint(...cur);
// calculate exclusion
const startExcl = first && range.startExcl;
const endExcl = last && range.endExcl;
// empty range?
if ((startExcl || endExcl) && last_bounds[0] === last_bounds[1])
return false;
emit = {start, end};
if (startExcl)
emit.startExcl = true;
if (endExcl)
emit.endExcl = true;
return true;
}
// initialize current state;
// exclude last dimension
for (let i=0; i<last_dim; i++){
const b = bounds[i];
const s = b[0];
const e = b[1];
const single = s === e;
still_pending += single^1;
cur_state.push(-single);
cur.push(s);
}
if (build_range(true, !still_pending))
yield emit;
// only last dimension changes
if (!still_pending)
return;
while (true){
// exclude last dimension
let dim = cur.length-2;
while (true){
let state = cur_state[dim];
// next value
if (!state){
if (++cur[dim] === bounds[dim][1]){
cur_state[dim] = 1;
// last string?
if (!--still_pending){
if (build_range(false, true))
yield emit;
return;
}
}
if (build_range())
yield emit;
break;
}
// reset dimension
if (state > 0){
cur_state[dim] = 0;
cur[dim] = bounds[dim][0];
still_pending++;
}
// ignore dims where state < 0
// with still_pending counter, dim will never be zero
dim--;
}
}
}
export {size, toRanges1d};