/** Helpers for processing the raw Unicode Character Database text files, plus other Unicode algorithms
* @namespace unidata
*/
// Dataset: https://www.unicode.org/Public/UCD/latest/ucd/
import fsPromises from "fs/promises";
import { dirname } from 'path';
import { fileURLToPath } from 'url';
const __dirname = dirname(fileURLToPath(import.meta.url));
/** Produce the loose-matching key for a unicode property name/value, following
 * https://www.unicode.org/reports/tr44/#Matching_Rules
 *
 * Whitespace and underscores are removed, a hyphen is removed only when it sits
 * directly between two ASCII letters/digits (a "medial" hyphen), every other
 * character is kept, and the result is lower-cased.
 * @param {string} str string to normalize
 * @returns {string} normalized string
 * @memberof unidata
 */
function normalize(str) {
  const isAlnum = (ch) => /[a-zA-Z0-9]/.test(ch);
  const kept = [];
  for (let pos = 0; pos < str.length; pos += 1) {
    const ch = str.charAt(pos);
    // whitespace and underscores never reach the output
    if (/[\s_]/.test(ch)) {
      continue;
    }
    // A hyphen is medial only when both immediate neighbours are alphanumeric.
    // charAt() gives "" outside the string, so a leading or trailing hyphen
    // fails this check and is kept, as is one that touches whitespace.
    if (ch === '-' && isAlnum(str.charAt(pos - 1)) && isAlnum(str.charAt(pos + 1))) {
      continue;
    }
    kept.push(ch);
  }
  const folded = kept.join("").toLowerCase();
  // "HANGUL JUNGSEONG O-E" must keep its hyphen so it stays distinct from
  // "HANGUL JUNGSEONG OE"; restore it when the input actually contained "O-E"
  if (folded === "hanguljungseongoe" && /O-E/i.test(str)) {
    return "hanguljungseongo-e";
  }
  return folded;
}
/** Read a UNIDATA text file located in the same directory as this module.
 *
 * The whole file is loaded, split on "\n", and each line has everything from a
 * `#` to the end of the line removed before being trimmed. Lines that end up
 * empty are skipped.
 * @param {string} name file name, appended to this module's directory
 * @yields {string[]} the `;`-separated columns of each remaining line, each
 *  passed through `normalize`
 * @memberof unidata
 */
async function* reader(name) {
  const contents = await fsPromises.readFile(`${__dirname}/${name}`);
  for (const rawLine of contents.toString().split("\n")) {
    // drop the comment part, then surrounding whitespace (including any "\r")
    const content = rawLine.replace(/#.*/g, "").trim();
    if (!content) {
      continue;
    }
    const columns = content.split(';');
    yield columns.map(normalize);
  }
}
export { normalize, reader }