summaryrefslogtreecommitdiffstats
path: root/text_processing
diff options
context:
space:
mode:
author Manos Pitsidianakis <el13635@mail.ntua.gr> 2019-07-22 15:14:39 +0300
committer Manos Pitsidianakis <el13635@mail.ntua.gr> 2019-07-27 01:41:04 +0300
commit d84ceca88e1882a8db3c7630633a3f840f4593d6 (patch)
tree e6a70be2491dd69ae6e6ff1c1c9496c4c2d5e61b /text_processing
parent ff37e97cbad1d95235ded0631973cc83cb2515fc (diff)
create text_processing crate
Diffstat (limited to 'text_processing')
-rw-r--r-- text_processing/Cargo.toml 9
-rw-r--r-- text_processing/src/grapheme_clusters.rs 1845
-rw-r--r-- text_processing/src/lib.rs 4
-rw-r--r-- text_processing/src/wcwidth.rs 661
4 files changed, 2519 insertions, 0 deletions
diff --git a/text_processing/Cargo.toml b/text_processing/Cargo.toml
new file mode 100644
index 00000000..4b6b1c02
--- /dev/null
+++ b/text_processing/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "text_processing"
+version = "0.0.1" #:version
+authors = ["Manos Pitsidianakis <el13635@mail.ntua.gr>"]
+workspace = ".."
+edition = "2018"
+
+[dependencies]
+unicode-segmentation = "1.2.1"
diff --git a/text_processing/src/grapheme_clusters.rs b/text_processing/src/grapheme_clusters.rs
new file mode 100644
index 00000000..b1f825bf
--- /dev/null
+++ b/text_processing/src/grapheme_clusters.rs
@@ -0,0 +1,1845 @@
+/*
+ Breaks a string into individual user-perceived "characters" (grapheme clusters),
+ following the Unicode UAX #29 standard, version 10.0.0.
+
+ Usage:
+ // returns a vector of string slices, one slice for each grapheme cluster
+ let graphemes = string.split_graphemes();
+
+*/
+
+use crate::wcwidth::{wcwidth, CodePointsIter};
+extern crate unicode_segmentation;
+use self::unicode_segmentation::UnicodeSegmentation;
+
+/// Grapheme-cluster helpers layered on top of `UnicodeSegmentation`.
+///
+/// Every method has a default body, so an implementor only has to satisfy the
+/// supertrait bounds. All segmentation calls pass `true` to the underlying
+/// `unicode_segmentation` functions, i.e. they ask for extended grapheme
+/// clusters.
+pub trait Graphemes: UnicodeSegmentation + CodePointsIter {
+    /// Returns every grapheme cluster of `self`, in order, as borrowed slices.
+    fn split_graphemes(&self) -> Vec<&str> {
+        UnicodeSegmentation::graphemes(self, true).collect()
+    }
+
+    /// Returns every grapheme cluster of `self` paired with the index reported
+    /// for it by `grapheme_indices` (its starting byte offset).
+    fn graphemes_indices(&self) -> Vec<(usize, &str)> {
+        UnicodeSegmentation::grapheme_indices(self, true).collect()
+    }
+
+    /// Returns the first grapheme cluster and its index, or `None` when `self`
+    /// contains no grapheme at all.
+    fn next_grapheme(&self) -> Option<(usize, &str)> {
+        UnicodeSegmentation::grapheme_indices(self, true).next()
+    }
+
+    /// Returns the last grapheme cluster and its index, or `None` when `self`
+    /// contains no grapheme at all.
+    fn last_grapheme(&self) -> Option<(usize, &str)> {
+        UnicodeSegmentation::grapheme_indices(self, true).next_back()
+    }
+
+    /// Sums `wcwidth` over every code point of `self`.
+    ///
+    /// Despite the name this measures the whole value, not a single grapheme.
+    /// Code points for which `wcwidth` returns `None` contribute 0.
+    fn grapheme_width(&self) -> usize {
+        let mut total = 0;
+        for code_point in self.code_points() {
+            total += wcwidth(code_point).unwrap_or(0);
+        }
+
+        total
+    }
+
+    /// Returns the number of grapheme clusters in `self`.
+    fn grapheme_len(&self) -> usize {
+        // Count straight off the iterator; collecting into a `Vec` first would
+        // allocate only to read its length.
+        UnicodeSegmentation::graphemes(self, true).count()
+    }
+}
+
+// Every method of `Graphemes` has a default body, so the impl block is empty.
+impl Graphemes for str {}
+
+/// Iterator state for breaking a string into lines: each call to `next`
+/// returns a prefix of `input` and advances `input` past it.
+pub struct WordBreakIter<'s> {
+ /// Text that has not been returned yet.
+ input: &'s str,
+ /// Width limit applied by `next` when splitting `input` into lines.
+ width: usize,
+}
+/// Yields successive lines of the remaining input, each at most `width`
+/// grapheme clusters long.
+///
+/// On every call to `next`:
+/// * leading spaces are dropped, and iteration ends when nothing else is left;
+/// * if a `'\n'` terminates a line that fits within `width`, that line is
+///   returned and the newline is consumed (an empty line yields `""`);
+/// * otherwise, if all of the remaining text fits, it is returned whole;
+/// * otherwise the text is split before the last single-space grapheme among
+///   the first `width` graphemes, or right after the `width`-th grapheme when
+///   there is no such space.
+///
+/// A `width` of 0 is treated as 1 so that every call consumes input.
+impl<'s> Iterator for WordBreakIter<'s> {
+    type Item = &'s str;
+
+    fn next(&mut self) -> Option<&'s str> {
+        // Trim before testing for emptiness: a remainder made only of spaces
+        // must end the iteration instead of producing a phantom empty line.
+        let input = self.input.trim_start_matches(' ');
+        self.input = input;
+        if input.is_empty() {
+            return None;
+        }
+        // A limit of 0 could never emit a grapheme; clamp it to 1 so each call
+        // is guaranteed to make progress.
+        let width = self.width.max(1);
+
+        if let Some(newline) = input.find('\n') {
+            let line = &input[..newline];
+            // Measure the line in graphemes, not bytes, so multi-byte text is
+            // held to the same limit as the wrapping below. `nth(width)` is
+            // `None` exactly when the line has at most `width` graphemes.
+            if UnicodeSegmentation::graphemes(line, true)
+                .nth(width)
+                .is_none()
+            {
+                self.input = &input[newline + 1..];
+                return Some(line);
+            }
+        }
+
+        // No usable newline: the first `width` graphemes contain no '\n' here,
+        // because any line that short was returned above.
+        let graphemes: Vec<(usize, &str)> = UnicodeSegmentation::grapheme_indices(input, true)
+            .take(width)
+            .collect();
+        // Byte offset just past the last collected grapheme.
+        let chunk_end = graphemes
+            .last()
+            .map_or(0, |&(start, g)| start + g.len());
+        let split_at = if chunk_end == input.len() {
+            // Everything that is left fits on one line.
+            chunk_end
+        } else {
+            // Prefer breaking before the last space; the space itself is
+            // trimmed by the next call. Fall back to a hard break.
+            graphemes
+                .iter()
+                .rev()
+                .find(|(_, g)| *g == " ")
+                .map_or(chunk_end, |&(start, _)| start)
+        };
+        let (line, rest) = input.split_at(split_at);
+        self.input = rest;
+        Some(line)
+    }
+}
+
+/// Breaks `s` into lines by draining a `WordBreakIter` configured with
+/// `width` and collecting the slices it produces.
+///
+/// The returned slices borrow from `s`.
+pub fn word_break_string(s: &str, width: usize) -> Vec<&str> {
+    WordBreakIter { input: s, width }.collect()
+}
+
+//#[derive(PartialEq)]
+//enum Property {
+// CR,
+// LF,
+// Control,
+// Extend,
+// Regional_Indicator,
+// SpacingMark,
+// L,
+// V,
+// T,
+// LV,
+// LVT,
+// Other,
+// Prepend,
+// E_Base,
+// E_Modifier,
+// ZWJ,
+// Glue_After_Zwj,
+// E_Base_GAZ,
+//}
+//
+//enum Breaks {
+// NotBreak,
+// BreakStart,
+// Break,
+// BreakLastRegional,
+// BreakPenultimateRegional,
+//}
+//
+//use Property::*;
+//use Breaks::*;
+//
+//impl From<u8> for Breaks {
+// fn from(u: u8) -> Breaks {
+// match u {
+// 0 => NotBreak,
+// 1 => BreakStart,
+// 2 => Break,
+// 3 => BreakLastRegional,
+// 4 => BreakPenultimateRegional,
+// _ => unreachable!()
+// }
+// }
+//}
+//
+//fn is_surrogate(s: &str, pos: usize) -> bool {
+// return 0xd800 <= char_code_at(s, pos) && char_code_at(s, pos) <= 0xdbff &&
+// 0xdc00 <= char_code_at(s, pos + 1) && char_code_at(s, pos + 1) <= 0xdfff;
+//}
+//
+//// Private function, gets a Unicode code point from a JavaScript UTF-16 string,
+//// handling surrogate pairs appropriately
+//fn code_point_at(s: &str, idx: usize) -> u8 {
+// let mut code: u8 = char_code_at(s, idx);
+//
+// // if a high surrogate
+// if (0x_d800 <= code && code <= 0x_dBFF &&
+// idx < str.length - 1){
+// let mut hi = code;
+// let mut low = char_code_at(s, idx + 1);
+// if (0x_dC00 <= low && low <= 0x_dFFF){
+// return ((hi - 0x_d800) * 0x400) + (low - 0x_dC00) + 0x10000;
+// }
+// return hi;
+// }
+//
+// // if a low surrogate
+// if (0x_dC00 <= code && code <= 0x_dFFF &&
+// idx >= 1){
+// let mut hi = char_code_at(s, idx - 1);
+// let mut low = code;
+// if (0x_d800 <= hi && hi <= 0x_dBFF){
+// return ((hi - 0x_d800) * 0x400) + (low - 0x_dC00) + 0x10000;
+// }
+// return low;
+// }
+//
+// //just return the char if an unmatched surrogate half or a
+// //single-char codepoint
+// return code;
+//}
+//
+//// Private function, returns whether a break is allowed between the
+//// two given grapheme breaking classes
+//fn should_break(start, mid, end) -> Breaks {
+// let mut all = [start, mid, end].into();
+// let mut previous = start;
+// let mut next = end
+//
+// // Lookahead terminator for:
+// // GB10. (E_Base | EBG) Extend* ? E_Modifier
+// let mut e_modifier_index = all.last_index_of(E_Modifier)
+// if(e_modifier_index > 1 &&
+// all.slice(1, e_modifier_index).every(function(c){return c == Extend}) &&
+// [Extend, E_Base, E_Base_GAZ].index_of(start) == -1){
+// return Break
+// }
+//
+// // Lookahead terminator for:
+// // GB12. ^ (RI RI)* RI ? RI
+// // GB13. [^RI] (RI RI)* RI ? RI
+// let mut r_iIndex = all.last_index_of(Regional_Indicator)
+// if(r_iIndex > 0 &&
+// all.slice(1, r_iIndex).every(function(c){return c == Regional_Indicator}) &&
+// [Prepend, Regional_Indicator].index_of(previous) == -1) {
+// if(all.filter(function(c){return c == Regional_Indicator}).length % 2 == 1) {
+// return BreakLastRegional
+// }
+// else {
+// return BreakPenultimateRegional
+// }
+// }
+//
+// // GB3. CR X LF
+// if(previous == CR && next == LF){
+// return NotBreak;
+// }
+// // GB4. (Control|CR|LF) ÷
+// else if(previous == Control || previous == CR || previous == LF){
+// if(next == E_Modifier && mid.every(function(c){return c == Extend})){
+// return Break
+// }
+// else {
+// return BreakStart
+// }
+// }
+// // GB5. ÷ (Control|CR|LF)
+// else if(next == Control || next == CR || next == LF){
+// return BreakStart;
+// }
+// // GB6. L X (L|V|LV|LVT)
+// else if(previous == L &&
+// (next == L || next == V || next == LV || next == LVT)){
+// return NotBreak;
+// }
+// // GB7. (LV|V) X (V|T)
+// else if((previous == LV || previous == V) &&
+// (next == V || next == T)){
+// return NotBreak;
+// }
+// // GB8. (LVT|T) X (T)
+// else if((previous == LVT || previous == T) &&
+// next == T){
+// return NotBreak;
+// }
+// // GB9. X (Extend|ZWJ)
+// else if (next == Extend || next == ZWJ){
+// return NotBreak;
+// }
+// // GB9a. X SpacingMark
+// else if(next == SpacingMark){
+// return NotBreak;
+// }
+// // GB9b. Prepend X
+// else if (previous == Prepend){
+// return NotBreak;
+// }
+//
+// // GB10. (E_Base | EBG) Extend* ? E_Modifier
+// let mut previous_non_extend_index = all.index_of(Extend) != -1 ? all.last_index_of(Extend) - 1 : all.length - 2;
+// if([E_Base, E_Base_GAZ].index_of(all[previous_non_extend_index]) != -1 &&
+// all.slice(previous_non_extend_index + 1, -1).every(function(c){return c == Extend}) &&
+// next == E_Modifier){
+// return NotBreak;
+// }
+//
+// // GB11. ZWJ ? (Glue_After_Zwj | EBG)
+// if(previous == ZWJ && [Glue_After_Zwj, E_Base_GAZ].index_of(next) != -1) {
+// return NotBreak;
+// }
+//
+// // GB12. ^ (RI RI)* RI ? RI
+// // GB13. [^RI] (RI RI)* RI ? RI
+// if(mid.index_of(Regional_Indicator) != -1) {
+// return Break;
+// }
+// if(previous == Regional_Indicator && next == Regional_Indicator) {
+// return NotBreak;
+// }
+//
+// // GB999. Any ? Any
+// return BreakStart;
+//}
+//
+//// Returns the next grapheme break in the string after the given index
+//fn next_break(s: &str, index: usize) -> Breaks {
+// // if(index < 0){
+// // return 0;
+// // }
+// if(index >= s.len() - 1){
+// return s.len().into();
+// }
+// let mut prev = get_grapheme_break_property(code_point_at(s, index));
+// let mut mid = []
+// for (let mut i = index + 1; i < s.len(); i++) {
+// // check for already processed low surrogates
+// if(is_surrogate(string, i - 1)){
+// continue;
+// }
+//
+// let mut next = get_grapheme_break_property(code_point_at(s, i));
+// if(should_break(prev, mid, next)){
+// return i.into();
+// }
+//
+// mid.push(next);
+// }
+// return s.len().into();
+//};
+//
+//// Breaks the given string into an array of grapheme cluster strings
+//fn split_graphemes(s: &str) -> Vec<&str> {
+// let mut res = Vec::new()
+// let mut index = 0;
+//
+// while let Some(brk) = next_break(s, index) {
+// res.push(&s[index..brk]);
+// index = brk;
+// }
+//
+// if(index < s.len()){
+// res.push(&s[index..]);
+// }
+//
+// return res;
+//};
+//
+//// Returns the iterator of grapheme clusters there are in the given string
+////fn iterate_graphemes(s: &str) {
+//// let mut index = 0;
+//// let mut res = {
+//// next: (function() {
+//// let mut value;
+//// let mut brk;
+//// if ((brk = this.next_break(str, index)) < str.length) {
+//// value = str.slice(index, brk);
+//// index = brk;
+//// return { value: value, done: false };
+//// }
+//// if (index < str.length) {
+//// value = str.slice(index);
+//// index = str.length;
+//// return { value: value, done: false };
+//// }
+//// return { value: undefined, done: true };
+//// }).bind(this)
+//// };
+//// // ES2015 @@iterator method (iterable) for spread syntax and for...of statement
+//// if (typeof Symbol !== 'undefined' && Symbol.iterator) {
+//// res[Symbol.iterator] = function() {return res};
+//// }
+//// return res;
+////};
+//
+//// Returns the number of grapheme clusters there are in the given string
+//fn count_graphemes(s:&str) -> usize {
+// let mut count = 0;
+// let mut index = 0;
+// while let Some(brk) = next_break(s, index) {
+// index = brk;
+// count++;
+// }
+//
+// if(index < s.len()){
+// count++;
+// }
+//
+// return count;
+//};
+//
+////given a Unicode code point, determines this symbol's grapheme break property
+//fn get_grapheme_break_property(code: u8) -> Property {
+//
+// //grapheme break property for Unicode 10.0.0,
+// //taken from http://www.unicode.org/Public/10.0.0/ucd/auxiliary/grapheme_break_property.txt
+// //and adapted to java_script rules
+//
+// if(
+// (0x0600 <= code && code <= 0x0605) || // Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE
+// 0x06DD == code || // Cf ARABIC END OF AYAH
+// 0x070F == code || // Cf SYRIAC ABBREVIATION MARK
+// 0x08E2 == code || // Cf ARABIC DISPUTED END OF AYAH
+// 0x0D4E == code || // Lo MALAYALAM LETTER DOT REPH
+// 0x110BD == code || // Cf KAITHI NUMBER SIGN
+// (0x111C2 <= code && code <= 0x111C3) || // Lo [2] SHARADA SIGN JIHVAMULIYA..SHARADA SIGN UPADHMANIYA
+// 0x11A3A == code || // Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA
+// (0x11A86 <= code && code <= 0x11A89) || // Lo [4] SOYOMBO CLUSTER-INITIAL LETTER RA..SOYOMBO CLUSTER-INITIAL LETTER SA
+// 0x11D46 == code // Lo MASARAM GONDI REPHA
+// ){
+// return Prepend;
+// }
+// if(
+// 0x000D == code // Cc <control-000D>
+// ){
+// return CR;
+// }
+//
+// if(
+// 0x000A == code // Cc <control-000A>
+// ){
+// return LF;
+// }
+//
+//
+// if(
+// (0x0000 <= code && code <= 0x0009) || // Cc [10] <control-0000>..<control-0009>
+// (0x000B <= code && code <= 0x000C) || // Cc [2] <control-000B>..<control-000C>
+// (0x000E <= code && code <= 0x001F) || // Cc [18] <control-000E>..<control-001F>
+// (0x007F <= code && code <= 0x009F) || // Cc [33] <control-007F>..<control-009F>
+// 0x00AD == code || // Cf SOFT HYPHEN
+// 0x061C == code || // Cf ARABIC LETTER MARK
+//
+// 0x180E == code || // Cf MONGOLIAN VOWEL SEPARATOR
+// 0x200B == code || // Cf ZERO WIDTH SPACE
+// (0x200E <= code && code <= 0x200F) || // Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK
+// 0x2028 == code || // Zl LINE SEPARATOR
+// 0x2029 == code || // Zp PARAGRAPH SEPARATOR
+// (0x202A <= code && code <= 0x202E) || // Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE
+// (0x2060 <= code && code <= 0x2064) || // Cf [5] WORD JOINER..INVISIBLE PLUS
+// 0x2065 == code || // Cn <reserved-2065>
+// (0x2066 <= code && code <= 0x206F) || // Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES
+// (0x_d800 <= code && code <= 0x_dFFF) || // Cs [2048] <surrogate-D800>..<surrogate-DFFF>
+// 0x_fEFF == code || // Cf ZERO WIDTH NO-BREAK SPACE
+// (0x_fFF0 <= code && code <= 0x_fFF8) || // Cn [9] <reserved-FFF0>..<reserved-FFF8>
+// (0x_fFF9 <= code && code <= 0x_fFFB) || // Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR
+// (0x1BCA0 <= code && code <= 0x1BCA3) || // Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
+// (0x1D173 <= code && code <= 0x1D17A) || // Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
+// 0x_e0000 == code || // Cn <reserved-E0000>
+// 0x_e0001 == code || // Cf LANGUAGE TAG
+// (0x_e0002 <= code && code <= 0x_e001F) || // Cn [30] <reserved-E0002>..<reserved-E001F>
+// (0x_e0080 <= code && code <= 0x_e00FF) || // Cn [128] <reserved-E0080>..<reserved-E00FF>
+// (0x_e01F0 <= code && code <= 0x_e0FFF) // Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
+// ){
+// return Control;
+// }
+//
+//
+// if(
+// (0x0300 <= code && code <= 0x036F) || // Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X
+// (0x0483 <= code && code <= 0x0487) || // Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE
+// (0x0488 <= code && code <= 0x0489) || // Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN
+// (0x0591 <= code && code <= 0x05BD) || // Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG
+// 0x05BF == code || // Mn HEBREW POINT RAFE
+// (0x05C1 <= code && code <= 0x05C2) || // Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT
+// (0x05C4 <= code && code <= 0x05C5) || // Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT
+// 0x05C7 == code || // Mn HEBREW POINT QAMATS QATAN
+// (0x0610 <= code && code <= 0x061A) || // Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA
+// (0x064B <= code && code <= 0x065F) || // Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW
+// 0x0670 == code || // Mn ARABIC LETTER SUPERSCRIPT ALEF
+// (0x06D6 <= code && code <= 0x06DC) || // Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN
+// (0x06DF <= code && code <= 0x06E4) || // Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA
+// (0x06E7 <= code && code <= 0x06E8) || // Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON
+// (0x06EA <= code && code <= 0x06ED) || // Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM
+// 0x0711 == code || // Mn SYRIAC LETTER SUPERSCRIPT ALAPH
+// (0x0730 <= code && code <= 0x074A) || // Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH
+// (0x07A6 <= code && code <= 0x07B0) || // Mn [11] THAANA ABAFILI..THAANA SUKUN
+// (0x07EB <= code && code <= 0x07F3) || // Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE
+// (0x0816 <= code && code <= 0x0819) || // Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH
+// (0x081B <= code && code <= 0x0823) || // Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A
+// (0x0825 <= code && code <= 0x0827) || // Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U
+// (0x0829 <= code && code <= 0x082D) || // Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA
+// (0x0859 <= code && code <= 0x085B) || // Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK
+// (0x08D4 <= code && code <= 0x08E1) || // Mn [14] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH SIGN SAFHA
+// (0x08E3 <= code && code <= 0x0902) || // Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA
+// 0x093A == code || // Mn DEVANAGARI VOWEL SIGN OE
+// 0x093C == code || // Mn DEVANAGARI SIGN NUKTA
+// (0x0941 <= code && code <= 0x0948) || // Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI
+// 0x094D == code || // Mn DEVANAGARI SIGN VIRAMA
+// (0x0951 <= code && code <= 0x0957) || // Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE
+// (0x0962 <= code && code <= 0x0963) || // Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL
+// 0x0981 == code || // Mn BENGALI SIGN CANDRABINDU
+// 0x09BC == code || // Mn BENGALI SIGN NUKTA
+// 0x09BE == code || // Mc BENGALI VOWEL SIGN AA
+// (0x09C1 <= code && code <= 0x09C4) || // Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR
+// 0x09CD == code || // Mn BENGALI SIGN VIRAMA
+// 0x09D7 == code || // Mc BENGALI AU LENGTH MARK
+// (0x09E2 <= code && code <= 0x09E3) || // Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL
+// (0x0A01 <= code && code <= 0x0A02) || // Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI
+// 0x0A3C == code || // Mn GURMUKHI SIGN NUKTA
+// (0x0A41 <= code && code <= 0x0A42) || // Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU
+// (0x0A47 <= code && code <= 0x0A48) || // Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI
+// (0x0A4B <= code && code <= 0x0A4D) || // Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA
+// 0x0A51 == code || // Mn GURMUKHI SIGN UDAAT
+// (0x0A70 <= code && code <= 0x0A71) || // Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK
+// 0x0A75 == code || // Mn GURMUKHI SIGN YAKASH
+// (0x0A81 <= code && code <= 0x0A82) || // Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA
+// 0x0ABC == code || // Mn GUJARATI SIGN NUKTA
+// (0x0AC1 <= code && code <= 0x0AC5) || // Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E
+// (0x0AC7 <= code && code <= 0x0AC8) || // Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI
+// 0x0ACD == code || // Mn GUJARATI SIGN VIRAMA
+// (0x0AE2 <= code && code <= 0x0AE3) || // Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL
+// (0x0AFA <= code && code <= 0x0AFF) || // Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE
+// 0x0B01 == code || // Mn ORIYA SIGN CANDRABINDU
+// 0x0B3C == code || // Mn ORIYA SIGN NUKTA
+// 0x0B3E == code || // Mc ORIYA VOWEL SIGN AA
+// 0x0B3F == code || // Mn ORIYA VOWEL SIGN I
+// (0x0B41 <= code && code <= 0x0B44) || // Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR
+// 0x0B4D == code || // Mn ORIYA SIGN VIRAMA
+// 0x0B56 == code || // Mn ORIYA AI LENGTH MARK
+// 0x0B57 == code || // Mc ORIYA AU LENGTH MARK
+// (0x0B62 <= code && code <= 0x0B63) || // Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL
+// 0x0B82 == code || // Mn TAMIL SIGN ANUSVARA
+// 0x0BBE == code || // Mc TAMIL VOWEL SIGN AA
+// 0x0BC0 == code || // Mn TAMIL VOWEL SIGN II
+// 0x0BCD == code || // Mn TAMIL SIGN VIRAMA
+// 0x0BD7 == code || // Mc TAMIL AU LENGTH MARK
+// 0x0C00 == code || // Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE
+// (0x0C3E <= code && code <= 0x0C40) || // Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
+// (0x0C46 <= code && code <= 0x0C48) || // Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
+// (0x0C4A <= code && code <= 0x0C4D) || // Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
+// (0x0C55 <= code && code <= 0x0C56) || // Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK
+// (0x0C62 <= code && code <= 0x0C63) || // Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL
+// 0x0C81 == code || // Mn KANNADA SIGN CANDRABINDU
+// 0x0CBC == code || // Mn KANNADA SIGN NUKTA
+// 0x0CBF == code || // Mn KANNADA VOWEL SIGN I
+// 0x0CC2 == code || // Mc KANNADA VOWEL SIGN UU
+// 0x0CC6 == code || // Mn KANNADA VOWEL SIGN E
+// (0x0CCC <= code && code <= 0x0CCD) || // Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
+// (0x0CD5 <= code && code <= 0x0CD6) || // Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
+// (0x0CE2 <= code && code <= 0x0CE3) || // Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
+// (0x0D00 <= code && code <= 0x0D01) || // Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
+// (0x0D3B <= code && code <= 0x0D3C) || // Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
+// 0x0D3E == code || // Mc MALAYALAM VOWEL SIGN AA
+// (0x0D41 <= code && code <= 0x0D44) || // Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR
+// 0x0D4D == code || // Mn MALAYALAM SIGN VIRAMA
+// 0x0D57 == code || // Mc MALAYALAM AU LENGTH MARK
+// (0x0D62 <= code && code <= 0x0D63) || // Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL
+// 0x0DCA == code || // Mn SINHALA SIGN AL-LAKUNA
+// 0x0DCF == code || // Mc SINHALA VOWEL SIGN AELA-PILLA
+// (0x0DD2 <= code && code <= 0x0DD4) || // Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA
+// 0x0DD6 == code || // Mn SINHALA VOWEL SIGN DIGA PAA-PILLA
+// 0x0DDF == code || // Mc SINHALA VOWEL SIGN GAYANUKITTA
+// 0x0E31 == code || // Mn THAI CHARACTER MAI HAN-AKAT
+// (0x0E34 <= code && code <= 0x0E3A) || // Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU
+// (0x0E47 <= code && code <= 0x0E4E) || // Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
+// 0x0EB1 == code || // Mn LAO VOWEL SIGN MAI KAN
+// (0x0EB4 <= code && code <= 0x0EB9) || // Mn [6] LAO VOWEL SIGN I..LAO VOWEL SIGN UU
+// (0x0EBB <= code && code <= 0x0EBC) || // Mn [2] LAO VOWEL SIGN MAI KON..LAO SEMIVOWEL SIGN LO
+// (0x0EC8 <= code && code <= 0x0ECD) || // Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
+// (0x0F18 <= code && code <= 0x0F19) || // Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
+// 0x0F35 == code || // Mn TIBETAN MARK NGAS BZUNG NYI ZLA
+// 0x0F37 == code || // Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS
+// 0x0F39 == code || // Mn TIBETAN MARK TSA -PHRU
+// (0x0F71 <= code && code <= 0x0F7E) || // Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO
+// (0x0F80 <= code && code <= 0x0F84) || // Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA
+// (0x0F86 <= code && code <= 0x0F87) || // Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS
+// (0x0F8D <= code && code <= 0x0F97) || // Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA
+// (0x0F99 <= code && code <= 0x0FBC) || // Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA
+// 0x0FC6 == code || // Mn TIBETAN SYMBOL PADMA GDAN
+// (0x102D <= code && code <= 0x1030) || // Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU
+// (0x1032 <= code && code <= 0x1037) || // Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW
+// (0x1039 <= code && code <= 0x103A) || // Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT
+// (0x103D <= code && code <= 0x103E) || // Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA
+// (0x1058 <= code && code <= 0x1059) || // Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL
+// (0x105E <= code && code <= 0x1060) || // Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL NA..MYANMAR CONSONANT SIGN MON MEDIAL LA
+// (0x1071 <= code && code <= 0x1074) || // Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE
+// 0x1082 == code || // Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA
+// (0x1085 <= code && code <= 0x1086) || // Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y
+// 0x108D == code || // Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE
+// 0x109D == code || // Mn MYANMAR VOWEL SIGN AITON AI
+// (0x135D <= code && code <= 0x135F) || // Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK
+// (0x1712 <= code && code <= 0x1714) || // Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
+// (0x1732 <= code && code <= 0x1734) || // Mn [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
+// (0x1752 <= code && code <= 0x1753) || // Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
+// (0x1772 <= code && code <= 0x1773) || // Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
+// (0x17B4 <= code && code <= 0x17B5) || // Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA
+// (0x17B7 <= code && code <= 0x17BD) || // Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA
+// 0x17C6 == code || // Mn KHMER SIGN NIKAHIT
+// (0x17C9 <= code && code <= 0x17D3) || // Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
+// 0x17DD == code || // Mn KHMER SIGN ATTHACAN
+// (0x180B <= code && code <= 0x180D) || // Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
+// (0x1885 <= code && code <= 0x1886) || // Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
+// 0x18A9 == code || // Mn MONGOLIAN LETTER ALI GALI DAGALGA
+// (0x1920 <= code && code <= 0x1922) || // Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
+// (0x1927 <= code && code <= 0x1928) || // Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O
+// 0x1932 == code || // Mn LIMBU SMALL LETTER ANUSVARA
+// (0x1939 <= code && code <= 0x193B) || // Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I
+// (0x1A17 <= code && code <= 0x1A18) || // Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U
+// 0x1A1B == code || // Mn BUGINESE VOWEL SIGN AE
+// 0x1A56 == code || // Mn TAI THAM CONSONANT SIGN MEDIAL LA
+// (0x1A58 <= code && code <= 0x1A5E) || // Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA
+// 0x1A60 == code || // Mn TAI THAM SIGN SAKOT
+// 0x1A62 == code || // Mn TAI THAM VOWEL SIGN MAI SAT
+// (0x1A65 <= code && code <= 0x1A6C) || // Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW
+// (0x1A73 <= code && code <= 0x1A7C) || // Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN
+// 0x1A7F == code || // Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT
+// (0x1AB0 <= code && code <= 0x1ABD) || // Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW
+// 0x1ABE == code || // Me COMBINING PARENTHESES OVERLAY
+// (0x1B00 <= code && code <= 0x1B03) || // Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
+// 0x1B34 == code || // Mn BALINESE SIGN REREKAN
+// (0x1B36 <= code && code <= 0x1B3A) || // Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA
+// 0x1B3C == code || // Mn BALINESE VOWEL SIGN LA LENGA
+// 0x1B42 == code || // Mn BALINESE VOWEL SIGN PEPET
+// (0x1B6B <= code && code <= 0x1B73) || // Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG
+// (0x1B80 <= code && code <= 0x1B81) || // Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR
+// (0x1BA2 <= code && code <= 0x1BA5) || // Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU
+// (0x1BA8 <= code && code <= 0x1BA9) || // Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG
+// (0x1BAB <= code && code <= 0x1BAD) || // Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA
+// 0x1BE6 == code || // Mn BATAK SIGN TOMPI
+// (0x1BE8 <= code && code <= 0x1BE9) || // Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE
+// 0x1BED == code || // Mn BATAK VOWEL SIGN KARO O
+// (0x1BEF <= code && code <= 0x1BF1) || // Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H
+// (0x1C2C <= code && code <= 0x1C33) || // Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T
+// (0x1C36 <= code && code <= 0x1C37) || // Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA
+// (0x1CD0 <= code && code <= 0x1CD2) || // Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA
+// (0x1CD4 <= code && code <= 0x1CE0) || // Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
+// (0x1CE2 <= code && code <= 0x1CE8) || // Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
+// 0x1CED == code || // Mn VEDIC SIGN TIRYAK
+// 0x1CF4 == code || // Mn VEDIC TONE CANDRA ABOVE
+// (0x1CF8 <= code && code <= 0x1CF9) || // Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
+// (0x1DC0 <= code && code <= 0x1DF9) || // Mn [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW
+// (0x1DFB <= code && code <= 0x1DFF) || // Mn [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
+// 0x200C == code || // Cf ZERO WIDTH NON-JOINER
+// (0x20D0 <= code && code <= 0x20DC) || // Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
+// (0x20DD <= code && code <= 0x20E0) || // Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
+// 0x20E1 == code || // Mn COMBINING LEFT RIGHT ARROW ABOVE
+// (0x20E2 <= code && code <= 0x20E4) || // Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE
+// (0x20E5 <= code && code <= 0x20F0) || // Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE
+// (0x2CEF <= code && code <= 0x2CF1) || // Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS
+// 0x2D7F == code || // Mn TIFINAGH CONSONANT JOINER
+// (0x2DE0 <= code && code <= 0x2DFF) || // Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER IOTIFIED BIG YUS
+// (0x302A <= code && code <= 0x302D) || // Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
+// (0x302E <= code && code <= 0x302F) || // Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK
+// (0x3099 <= code && code <= 0x309A) || // Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
+// 0x_a66F == code || // Mn COMBINING CYRILLIC VZMET
+// (0x_a670 <= code && code <= 0x_a672) || // Me [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN
+// (0x_a674 <= code && code <= 0x_a67D) || // Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK
+// (0x_a69E <= code && code <= 0x_a69F) || // Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E
+// (0x_a6F0 <= code && code <= 0x_a6F1) || // Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS
+// 0x_a802 == code || // Mn SYLOTI NAGRI SIGN DVISVARA
+// 0x_a806 == code || // Mn SYLOTI NAGRI SIGN HASANTA
+// 0x_a80B == code || // Mn SYLOTI NAGRI SIGN ANUSVARA
+// (0x_a825 <= code && code <= 0x_a826) || // Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E
+// (0x_a8C4 <= code && code <= 0x_a8C5) || // Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU
+// (0x_a8E0 <= code && code <= 0x_a8F1) || // Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA
+// (0x_a926 <= code && code <= 0x_a92D) || // Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU
+// (0x_a947 <= code && code <= 0x_a951) || // Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R
+// (0x_a980 <= code && code <= 0x_a982) || // Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR
+// 0x_a9B3 == code || // Mn JAVANESE SIGN CECAK TELU
+// (0x_a9B6 <= code && code <= 0x_a9B9) || // Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT
+// 0x_a9BC == code || // Mn JAVANESE VOWEL SIGN PEPET
+// 0x_a9E5 == code || // Mn MYANMAR SIGN SHAN SAW
+// (0x_aA29 <= code && code <= 0x_aA2E) || // Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE
+// (0x_aA31 <= code && code <= 0x_aA32) || // Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE
+// (0x_aA35 <= code && code <= 0x_aA36) || // Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA
+// 0x_aA43 == code || // Mn CHAM CONSONANT SIGN FINAL NG
+// 0x_aA4C == code || // Mn CHAM CONSONANT SIGN FINAL M
+// 0x_aA7C == code || // Mn MYANMAR SIGN TAI LAING TONE-2
+// 0x_aAB0 == code || // Mn TAI VIET MAI KANG
+// (0x_aAB2 <= code && code <= 0x_aAB4) || // Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U
+// (0x_aAB7 <= code && code <= 0x_aAB8) || // Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA
+// (0x_aABE <= code && code <= 0x_aABF) || // Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK
+// 0x_aAC1 == code || // Mn TAI VIET TONE MAI THO
+// (0x_aAEC <= code && code <= 0x_aAED) || // Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI
+// 0x_aAF6 == code || // Mn MEETEI MAYEK VIRAMA
+// 0x_aBE5 == code || // Mn MEETEI MAYEK VOWEL SIGN ANAP
+// 0x_aBE8 == code || // Mn MEETEI MAYEK VOWEL SIGN UNAP
+// 0x_aBED == code || // Mn MEETEI MAYEK APUN IYEK
+// 0x_fB1E == code || // Mn HEBREW POINT JUDEO-SPANISH VARIKA
+// (0x_fE00 <= code && code <= 0x_fE0F) || // Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
+// (0x_fE20 <= code && code <= 0x_fE2F) || // Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF
+// (0x_fF9E <= code && code <= 0x_fF9F) || // Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
+// 0x101FD == code || // Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE
+// 0x102E0 == code || // Mn COPTIC EPACT THOUSANDS MARK
+// (0x10376 <= code && code <= 0x1037A) || // Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII
+// (0x10A01 <= code && code <= 0x10A03) || // Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R
+// (0x10A05 <= code && code <= 0x10A06) || // Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O
+// (0x10A0C <= code && code <= 0x10A0F) || // Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA
+// (0x10A38 <= code && code <= 0x10A3A) || // Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW
+// 0x10A3F == code || // Mn KHAROSHTHI VIRAMA
+// (0x10AE5 <= code && code <= 0x10AE6) || // Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
+// 0x11001 == code || // Mn BRAHMI SIGN ANUSVARA
+// (0x11038 <= code && code <= 0x11046) || // Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA
+//