types_UnicodeNorm.mjs

import IntNormType from "./IntNorm.mjs";
import { create, copy } from "./helpers/common.mjs";
import { compareLength, lastCodepoint, utf16Length } from "./helpers/unicode.mjs";

/** This is the same as {@link UnicodeType}, but where the range bounds have been normalized to
 * always be inclusive. This can be easier to work with, and omits the extra logic needed to handle
 * exclusive bounds.
 * 
 * This uses {@link IntNormType} internally for operations on codepoints.
 * @implements {RangeType}
 */
const UnicodeNormType = {
	base_type: IntNormType,
	create,
	copy,
	toBaseTypeRange(range){
		return {
			start: lastCodepoint(range.start),
			end: lastCodepoint(range.end),
			startExcl: range.startExcl,
			endExcl: range.endExcl
		};
	},
	setStart(range, start, startExcl){
		const last = lastCodepoint(start);
		const prefix = start.slice(0, -utf16Length(last));
		this.base_type.setStart(range, last, startExcl);
		range.start = prefix+String.fromCodePoint(range.start);
		return range;
	},
	setEnd(range, end, endExcl){
		const last = lastCodepoint(end);
		const prefix = end.slice(0, -utf16Length(last));
		this.base_type.setEnd(range, last, endExcl);
		range.end = prefix+String.fromCodePoint(range.end);
		return range;
	},
	size(range){
		// delegate to base_type
		return this.base_type.size(this.toBaseTypeRange(range));
	},
	sample(range, i){
		const base = this.toBaseTypeRange(range);
		const base_len = utf16Length(base.start);
		const prefix = range.start.slice(0, -base_len);
		const s = this.base_type.sample(base, i);
		return prefix + String.fromCodePoint(s);
	},
	compare(mode, a, b, aExcl, bExcl){
		/* Can't sensibly measure distance for different string lengths or when there are
			differences in the non-last dimension. So we return infinity there, e.g. like they are
			separate number lines. This also means it will fallback to binary search in those cases
		*/
		let side = compareLength(a, b);
		// same lengths?
		degenerate: if (!side){
			// extract prefix; must be equal
			const a_last = lastCodepoint(a);
			const b_last = lastCodepoint(b);
			let a_prefix_len = a.length - utf16Length(a_last);
			let b_prefix_len = b.length - utf16Length(b_last);
			// quick check for inequality
			side = a_prefix_len - b_prefix_len;
			if (side)
				break degenerate;
			// need to do actual string comparison
			if (a_prefix_len){
				// this compares utf16 code units, rather than utf32; that's fine, since we are
				// just checking for degenerate comparisons anyways; it will be faster
				const a_prefix = a.substring(0, a_prefix_len);
				const b_prefix = b.substring(0, b_prefix_len);
				if (a_prefix !== b_prefix){
					side = a_prefix < b_prefix ? -1 : 1
					break degenerate;
				}
			}
			// okay to compare last code point;
			// delegate to base_type
			return this.base_type.compare(mode, a_last, b_last, aExcl, bExcl);
		}
		// degenerate case
		return {distance: side*Infinity, side};
	},
	*iterate(range, ...args){
		const start = lastCodepoint(range.start);
		const start_len = utf16Length(start);
		// delegate to base_type
		const gen = this.base_type.iterate({
			start,
			end: lastCodepoint(range.end),
			startExcl: range.startExcl,
			endExcl: range.endExcl
		}, ...args);
		if (range.start.length !== start_len){
			const prefix = range.start.slice(0, -start_len);
			for (const code of gen)
				yield (prefix+String.fromCodePoint(code));
			return;
		}
		// optimized for no prefix
		for (const code of gen)
			yield (String.fromCodePoint(code));
	},
	// check that start/end are same length
	validate(range){
		const res = this.compare(0b10, range.start, range.end, range.startExcl, range.endExcl);
		return isFinite(res.distance) && res.side <= 0;
	}
};

export default UnicodeNormType;