diff options
Diffstat (limited to 'src/text_processing')
-rw-r--r-- | src/text_processing/grapheme_clusters.rs | 1866 | ||||
-rw-r--r-- | src/text_processing/line_break.rs | 724 | ||||
-rw-r--r-- | src/text_processing/tables.rs | 3410 | ||||
-rw-r--r-- | src/text_processing/types.rs | 123 | ||||
-rw-r--r-- | src/text_processing/wcwidth.rs | 682 |
5 files changed, 6805 insertions, 0 deletions
diff --git a/src/text_processing/grapheme_clusters.rs b/src/text_processing/grapheme_clusters.rs
new file mode 100644
index 0000000..ddc61ed
--- /dev/null
+++ b/src/text_processing/grapheme_clusters.rs
@@ -0,0 +1,1866 @@
+/*
+ * bb
+ *
+ * Copyright 2019 Manos Pitsidianakis
+ *
+ * This file is part of bb.
+ *
+ * bb is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * bb is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with bb. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+  Breaks a string into individual user-perceived "characters"
+  Unicode UAX-29 standard, version 10.0.0
+
+  Usage:
+  // returns a vector of string slices, one slice for each grapheme cluster
+  let graphemes = string.split_graphemes();
+
+*/
+
+use crate::wcwidth::{wcwidth, CodePointsIter};
+extern crate unicode_segmentation;
+use self::unicode_segmentation::UnicodeSegmentation;
+
+/// Grapheme-cluster conveniences layered on top of `UnicodeSegmentation`.
+///
+/// Every method segments with the crate's "extended" mode (the `true` flag
+/// passed to `graphemes` / `grapheme_indices`).
+pub trait Graphemes: UnicodeSegmentation + CodePointsIter {
+    /// Returns all grapheme clusters of `self`, in order.
+    fn split_graphemes(&self) -> Vec<&str> {
+        UnicodeSegmentation::graphemes(self, true).collect()
+    }
+
+    /// Returns all grapheme clusters of `self` paired with the byte offset
+    /// at which each one starts.
+    fn graphemes_indices(&self) -> Vec<(usize, &str)> {
+        UnicodeSegmentation::grapheme_indices(self, true).collect()
+    }
+
+    /// Returns the first grapheme cluster and its byte offset, or `None` if
+    /// `self` is empty.
+    fn next_grapheme(&self) -> Option<(usize, &str)> {
+        UnicodeSegmentation::grapheme_indices(self, true).next()
+    }
+
+    /// Returns the last grapheme cluster and its byte offset, or `None` if
+    /// `self` is empty.
+    fn last_grapheme(&self) -> Option<(usize, &str)> {
+        UnicodeSegmentation::grapheme_indices(self, true).next_back()
+    }
+
+    /// Sums `wcwidth` over every code point of `self`.
+    ///
+    /// Code points for which `wcwidth` returns `None` contribute 0.
+    /// NOTE(review): presumably the result is a terminal column count;
+    /// confirm against the `wcwidth` module.
+    fn grapheme_width(&self) -> usize {
+        self.code_points().map(|c| wcwidth(c).unwrap_or(0)).sum()
+    }
+
+    /// Returns the number of grapheme clusters in `self`.
+    fn grapheme_len(&self) -> usize {
+        // Count lazily; there is no need to materialise the clusters.
+        UnicodeSegmentation::graphemes(self, true).count()
+    }
+}
+
+impl Graphemes for str {}
+
+/// Iterator that cuts `input` into lines of at most `width` grapheme
+/// clusters.
+///
+/// Lines end at `'\n'` or, when the text is too long, at the last `" "`
+/// that still allows the line to fit. Text without a usable space is cut
+/// after `width` graphemes. Spaces at the start of each line are dropped.
+pub struct WordBreakIter<'s> {
+    /// Text that has not been emitted yet.
+    input: &'s str,
+    /// Maximum number of grapheme clusters per emitted line.
+    width: usize,
+}
+impl<'s> Iterator for WordBreakIter<'s> {
+    type Item = &'s str;
+
+    /// Yields the next line, or `None` once only spaces (or nothing) remain.
+    ///
+    /// A `'\n'` at the start of the remaining text produces an empty slice,
+    /// so blank lines are preserved.
+    fn next(&mut self) -> Option<&'s str> {
+        // Trim before testing for emptiness so that a remainder made only of
+        // spaces ends the iteration instead of producing a bogus empty line.
+        self.input = self.input.trim_start_matches(' ');
+        if self.input.is_empty() {
+            return None;
+        }
+        // A width of zero can never hold a grapheme; clamp it so that every
+        // call consumes input (the unclamped code panicked here).
+        let width = self.width.max(1);
+
+        if self.input.starts_with('\n') {
+            let ret = &self.input[..0];
+            self.input = &self.input[1..];
+            return Some(ret);
+        }
+
+        if let Some(newline_idx) = self.input.find('\n') {
+            let line = &self.input[..newline_idx];
+            // Measure the line in grapheme clusters, not bytes, so that
+            // multi-byte text is not pushed past its newline. Looking at
+            // `width + 1` clusters is enough to know whether it fits.
+            let fits = UnicodeSegmentation::graphemes(line, true)
+                .take(width.saturating_add(1))
+                .count()
+                <= width;
+            if fits {
+                self.input = &self.input[newline_idx + 1..];
+                return Some(line);
+            }
+        }
+
+        // One grapheme more than fits: its presence tells us the text
+        // overflows, and its offset is where a hard cut has to happen.
+        let window: Vec<(usize, &str)> = UnicodeSegmentation::grapheme_indices(self.input, true)
+            .take(width.saturating_add(1))
+            .collect();
+        if window.len() <= width {
+            /* everything that is left fits on one line */
+            let ret = self.input;
+            self.input = &ret[ret.len()..];
+            return Some(ret);
+        }
+
+        // Prefer the last space in the window (including one sitting right
+        // after the `width`-th grapheme); otherwise cut after `width`
+        // graphemes. The split offset is never 0: leading spaces were
+        // trimmed above and `width >= 1`.
+        let split_at = window
+            .iter()
+            .rposition(|(_, g)| *g == " ")
+            .map_or(window[width].0, |pos| window[pos].0);
+        let (ret, rest) = self.input.split_at(split_at);
+        self.input = rest;
+        Some(ret)
+    }
+}
+
+/// Splits `s` into lines of at most `width` grapheme clusters.
+///
+/// See `WordBreakIter` for the breaking rules.
+pub fn word_break_string(s: &str, width: usize) -> Vec<&str> {
+    WordBreakIter { input: s, width }.collect()
+}
+
+//#[derive(PartialEq)]
+//enum Property {
+//    CR,
+//    LF,
+//    Control,
+//    Extend,
+//    Regional_Indicator,
+//    SpacingMark,
+//    L,
+//    V,
+//    T,
+//    LV,
+//    LVT,
+//    Other,
+//    Prepend,
+//    E_Base,
+//    E_Modifier,
+//    ZWJ,
+//    Glue_After_Zwj,
+//    E_Base_GAZ,
+//}
+//
+//enum Breaks {
+// NotBreak, +// BreakStart, +// Break, +// BreakLastRegional, +// BreakPenultimateRegional, +//} +// +//use Property::*; +//use Breaks::*; +// +//impl From<u8> for Breaks { +// fn from(u: u8) -> Breaks { +// match u { +// 0 => NotBreak, +// 1 => BreakStart, +// 2 => Break, +// 3 => BreakLastRegional, +// 4 => BreakPenultimateRegional, +// _ => unreachable!() +// } +// } +//} +// +//fn is_surrogate(s: &str, pos: usize) -> bool { +// return 0xd800 <= char_code_at(s, pos) && char_code_at(s, pos) <= 0xdbff && +// 0xdc00 <= char_code_at(s, pos + 1) && char_code_at(s, pos + 1) <= 0xdfff; +//} +// +//// Private function, gets a Unicode code point from a java_script UTF-16 string +//// handling surrogate pairs appropriately +//fn code_point_at(s: &str, idx: usize) -> u8 { +// let mut code: u8 = char_code_at(s, idx); +// +// // if a high surrogate +// if (0x_d800 <= code && code <= 0x_dBFF && +// idx < str.length - 1){ +// let mut hi = code; +// let mut low = char_code_at(s, idx + 1); +// if (0x_dC00 <= low && low <= 0x_dFFF){ +// return ((hi - 0x_d800) * 0x400) + (low - 0x_dC00) + 0x10000; +// } +// return hi; +// } +// +// // if a low surrogate +// if (0x_dC00 <= code && code <= 0x_dFFF && +// idx >= 1){ +// let mut hi = char_code_at(s, idx - 1); +// let mut low = code; +// if (0x_d800 <= hi && hi <= 0x_dBFF){ +// return ((hi - 0x_d800) * 0x400) + (low - 0x_dC00) + 0x10000; +// } +// return low; +// } +// +// //just return the char if an unmatched surrogate half or a +// //single-char codepoint +// return code; +//} +// +//// Private function, returns whether a break is allowed between the +//// two given grapheme breaking classes +//fn should_break(start, mid, end) -> Breaks { +// let mut all = [start, mid, end].into(); +// let mut previous = start; +// let mut next = end +// +// // Lookahead termintor for: +// // GB10. (E_Base | EBG) Extend* ? 
E_Modifier +// let mut e_modifier_index = all.last_index_of(E_Modifier) +// if(e_modifier_index > 1 && +// all.slice(1, e_modifier_index).every(function(c){return c == Extend}) && +// [Extend, E_Base, E_Base_GAZ].index_of(start) == -1){ +// return Break +// } +// +// // Lookahead termintor for: +// // GB12. ^ (RI RI)* RI ? RI +// // GB13. [^RI] (RI RI)* RI ? RI +// let mut r_iIndex = all.last_index_of(Regional_Indicator) +// if(r_iIndex > 0 && +// all.slice(1, r_iIndex).every(function(c){return c == Regional_Indicator}) && +// [Prepend, Regional_Indicator].index_of(previous) == -1) { +// if(all.filter(function(c){return c == Regional_Indicator}).length % 2 == 1) { +// return BreakLastRegional +// } +// else { +// return BreakPenultimateRegional +// } +// } +// +// // GB3. CR X LF +// if(previous == CR && next == LF){ +// return NotBreak; +// } +// // GB4. (Control|CR|LF) ÷ +// else if(previous == Control || previous == CR || previous == LF){ +// if(next == E_Modifier && mid.every(function(c){return c == Extend})){ +// return Break +// } +// else { +// return BreakStart +// } +// } +// // GB5. ÷ (Control|CR|LF) +// else if(next == Control || next == CR || next == LF){ +// return BreakStart; +// } +// // GB6. L X (L|V|LV|LVT) +// else if(previous == L && +// (next == L || next == V || next == LV || next == LVT)){ +// return NotBreak; +// } +// // GB7. (LV|V) X (V|T) +// else if((previous == LV || previous == V) && +// (next == V || next == T)){ +// return NotBreak; +// } +// // GB8. (LVT|T) X (T) +// else if((previous == LVT || previous == T) && +// next == T){ +// return NotBreak; +// } +// // GB9. X (Extend|ZWJ) +// else if (next == Extend || next == ZWJ){ +// return NotBreak; +// } +// // GB9a. X SpacingMark +// else if(next == SpacingMark){ +// return NotBreak; +// } +// // GB9b. Prepend X +// else if (previous == Prepend){ +// return NotBreak; +// } +// +// // GB10. (E_Base | EBG) Extend* ? 
E_Modifier +// let mut previous_non_extend_index = all.index_of(Extend) != -1 ? all.last_index_of(Extend) - 1 : all.length - 2; +// if([E_Base, E_Base_GAZ].index_of(all[previous_non_extend_index]) != -1 && +// all.slice(previous_non_extend_index + 1, -1).every(function(c){return c == Extend}) && +// next == E_Modifier){ +// return NotBreak; +// } +// +// // GB11. ZWJ ? (Glue_After_Zwj | EBG) +// if(previous == ZWJ && [Glue_After_Zwj, E_Base_GAZ].index_of(next) != -1) { +// return NotBreak; +// } +// +// // GB12. ^ (RI RI)* RI ? RI +// // GB13. [^RI] (RI RI)* RI ? RI +// if(mid.index_of(Regional_Indicator) != -1) { +// return Break; +// } +// if(previous == Regional_Indicator && next == Regional_Indicator) { +// return NotBreak; +// } +// +// // GB999. Any ? Any +// return BreakStart; +//} +// +//// Returns the next grapheme break in the string after the given index +//fn next_break(s: &str, index: usize) -> Breaks { +// // if(index < 0){ +// // return 0; +// // } +// if(index >= s.len() - 1){ +// return s.len().into(); +// } +// let mut prev = get_grapheme_break_property(code_point_at(s, index)); +// let mut mid = [] +// for (let mut i = index + 1; i < s.len(); i++) { +// // check for already processed low surrogates +// if(is_surrogate(string, i - 1)){ +// continue; +// } +// +// let mut next = get_grapheme_break_property(code_point_at(s, i)); +// if(should_break(prev, mid, next)){ +// return i.into(); +// } +// +// mid.push(next); +// } +// return s.len().into(); +//}; +// +//// Breaks the given string into an array of grapheme cluster strings +//fn split_graphemes(s: &str) -> Vec<&str> { +// let mut res = Vec::new() +// let mut index = 0; +// +// while let Some(brk) = next_break(s, index) { +// res.push(&s[index..brk]); +// index = brk; +// } +// +// if(index < s.len()){ +// res.push(&s[index..]); +// } +// +// return res; +//}; +// +//// Returns the iterator of grapheme clusters there are in the given string +////fn iterate_graphemes(s: &str) { +//// let mut 
index = 0; +//// let mut res = { +//// next: (function() { +//// let mut value; +//// let mut brk; +//// if ((brk = this.next_break(str, index)) < str.length) { +//// value = str.slice(index, brk); +//// index = brk; +//// return { value: value, done: false }; +//// } +//// if (index < str.length) { +//// value = str.slice(index); +//// index = str.length; +//// return { value: value, done: false }; +//// } +//// return { value: undefined, done: true }; +//// }).bind(this) +//// }; +//// // ES2015 @@iterator method (iterable) for spread syntax and for...of statement +//// if (typeof Symbol !== 'undefined' && Symbol.iterator) { +//// res[Symbol.iterator] = function() {return res}; +//// } +//// return res; +////}; +// +//// Returns the number of grapheme clusters there are in the given string +//fn count_graphemes(s:&str) -> usize { +// let mut count = 0; +// let mut index = 0; +// while let Some(brk) = next_break(s, index) { +// index = brk; +// count++; +// } +// +// if(index < s.len()){ +// count++; +// } +// +// return count; +//}; +// +////given a Unicode code point, determines this symbol's grapheme break property +//fn get_grapheme_break_property(code: u8) -> Property { +// +// //grapheme break property for Unicode 10.0.0, +// //taken from http://www.unicode.org/Public/10.0.0/ucd/auxiliary/grapheme_break_property.txt +// //and adapted to java_script rules +// +// if( +// (0x0600 <= code && code <= 0x0605) || // Cf [6] ARABIC NUMBER SIGN..ARABIC NUMBER MARK ABOVE +// 0x06DD == code || // Cf ARABIC END OF AYAH +// 0x070F == code || // Cf SYRIAC ABBREVIATION MARK +// 0x08E2 == code || // Cf ARABIC DISPUTED END OF AYAH +// 0x0D4E == code || // Lo MALAYALAM LETTER DOT REPH +// 0x110BD == code || // Cf KAITHI NUMBER SIGN +// (0x111C2 <= code && code <= 0x111C3) || // Lo [2] SHARADA SIGN JIHVAMULIYA..SHARADA SIGN UPADHMANIYA +// 0x11A3A == code || // Lo ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA +// (0x11A86 <= code && code <= 0x11A89) || // Lo [4] SOYOMBO 
CLUSTER-INITIAL LETTER RA..SOYOMBO CLUSTER-INITIAL LETTER SA +// 0x11D46 == code // Lo MASARAM GONDI REPHA +// ){ +// return Prepend; +// } +// if( +// 0x000D == code // Cc <control-000D> +// ){ +// return CR; +// } +// +// if( +// 0x000A == code // Cc <control-000A> +// ){ +// return LF; +// } +// +// +// if( +// (0x0000 <= code && code <= 0x0009) || // Cc [10] <control-0000>..<control-0009> +// (0x000B <= code && code <= 0x000C) || // Cc [2] <control-000B>..<control-000C> +// (0x000E <= code && code <= 0x001F) || // Cc [18] <control-000E>..<control-001F> +// (0x007F <= code && code <= 0x009F) || // Cc [33] <control-007F>..<control-009F> +// 0x00AD == code || // Cf SOFT HYPHEN +// 0x061C == code || // Cf ARABIC LETTER MARK +// +// 0x180E == code || // Cf MONGOLIAN VOWEL SEPARATOR +// 0x200B == code || // Cf ZERO WIDTH SPACE +// (0x200E <= code && code <= 0x200F) || // Cf [2] LEFT-TO-RIGHT MARK..RIGHT-TO-LEFT MARK +// 0x2028 == code || // Zl LINE SEPARATOR +// 0x2029 == code || // Zp PARAGRAPH SEPARATOR +// (0x202A <= code && code <= 0x202E) || // Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE +// (0x2060 <= code && code <= 0x2064) || // Cf [5] WORD JOINER..INVISIBLE PLUS +// 0x2065 == code || // Cn <reserved-2065> +// (0x2066 <= code && code <= 0x206F) || // Cf [10] LEFT-TO-RIGHT ISOLATE..NOMINAL DIGIT SHAPES +// (0x_d800 <= code && code <= 0x_dFFF) || // Cs [2048] <surrogate-D800>..<surrogate-DFFF> +// 0x_fEFF == code || // Cf ZERO WIDTH NO-BREAK SPACE +// (0x_fFF0 <= code && code <= 0x_fFF8) || // Cn [9] <reserved-FFF0>..<reserved-FFF8> +// (0x_fFF9 <= code && code <= 0x_fFFB) || // Cf [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR +// (0x1BCA0 <= code && code <= 0x1BCA3) || // Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP +// (0x1D173 <= code && code <= 0x1D17A) || // Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE +// 0x_e0000 == code || // Cn <reserved-E0000> +// 0x_e0001 == code || // Cf LANGUAGE 
TAG +// (0x_e0002 <= code && code <= 0x_e001F) || // Cn [30] <reserved-E0002>..<reserved-E001F> +// (0x_e0080 <= code && code <= 0x_e00FF) || // Cn [128] <reserved-E0080>..<reserved-E00FF> +// (0x_e01F0 <= code && code <= 0x_e0FFF) // Cn [3600] <reserved-E01F0>..<reserved-E0FFF> +// ){ +// return Control; +// } +// +// +// if( +// (0x0300 <= code && code <= 0x036F) || // Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X +// (0x0483 <= code && code <= 0x0487) || // Mn [5] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC POKRYTIE +// (0x0488 <= code && code <= 0x0489) || // Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN +// (0x0591 <= code && code <= 0x05BD) || // Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG +// 0x05BF == code || // Mn HEBREW POINT RAFE +// (0x05C1 <= code && code <= 0x05C2) || // Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT +// (0x05C4 <= code && code <= 0x05C5) || // Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT +// 0x05C7 == code || // Mn HEBREW POINT QAMATS QATAN +// (0x0610 <= code && code <= 0x061A) || // Mn [11] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL KASRA +// (0x064B <= code && code <= 0x065F) || // Mn [21] ARABIC FATHATAN..ARABIC WAVY HAMZA BELOW +// 0x0670 == code || // Mn ARABIC LETTER SUPERSCRIPT ALEF +// (0x06D6 <= code && code <= 0x06DC) || // Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN +// (0x06DF <= code && code <= 0x06E4) || // Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA +// (0x06E7 <= code && code <= 0x06E8) || // Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON +// (0x06EA <= code && code <= 0x06ED) || // Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM +// 0x0711 == code || // Mn SYRIAC LETTER SUPERSCRIPT ALAPH +// (0x0730 <= code && code <= 0x074A) || // Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH +// (0x07A6 <= code && code <= 0x07B0) || // Mn [11] THAANA ABAFILI..THAANA 
SUKUN +// (0x07EB <= code && code <= 0x07F3) || // Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE +// (0x0816 <= code && code <= 0x0819) || // Mn [4] SAMARITAN MARK IN..SAMARITAN MARK DAGESH +// (0x081B <= code && code <= 0x0823) || // Mn [9] SAMARITAN MARK EPENTHETIC YUT..SAMARITAN VOWEL SIGN A +// (0x0825 <= code && code <= 0x0827) || // Mn [3] SAMARITAN VOWEL SIGN SHORT A..SAMARITAN VOWEL SIGN U +// (0x0829 <= code && code <= 0x082D) || // Mn [5] SAMARITAN VOWEL SIGN LONG I..SAMARITAN MARK NEQUDAA +// (0x0859 <= code && code <= 0x085B) || // Mn [3] MANDAIC AFFRICATION MARK..MANDAIC GEMINATION MARK +// (0x08D4 <= code && code <= 0x08E1) || // Mn [14] ARABIC SMALL HIGH WORD AR-RUB..ARABIC SMALL HIGH SIGN SAFHA +// (0x08E3 <= code && code <= 0x0902) || // Mn [32] ARABIC TURNED DAMMA BELOW..DEVANAGARI SIGN ANUSVARA +// 0x093A == code || // Mn DEVANAGARI VOWEL SIGN OE +// 0x093C == code || // Mn DEVANAGARI SIGN NUKTA +// (0x0941 <= code && code <= 0x0948) || // Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI +// 0x094D == code || // Mn DEVANAGARI SIGN VIRAMA +// (0x0951 <= code && code <= 0x0957) || // Mn [7] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI VOWEL SIGN UUE +// (0x0962 <= code && code <= 0x0963) || // Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL +// 0x0981 == code || // Mn BENGALI SIGN CANDRABINDU +// 0x09BC == code || // Mn BENGALI SIGN NUKTA +// 0x09BE == code || // Mc BENGALI VOWEL SIGN AA +// (0x09C1 <= code && code <= 0x09C4) || // Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR +// 0x09CD == code || // Mn BENGALI SIGN VIRAMA +// 0x09D7 == code || // Mc BENGALI AU LENGTH MARK +// (0x09E2 <= code && code <= 0x09E3) || // Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL +// (0x0A01 <= code && code <= 0x0A02) || // Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI +// 0x0A3C == code || // Mn GURMUKHI SIGN NUKTA +// (0x0A41 <= code && code <= 0x0A42) || // Mn [2] 
GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU +// (0x0A47 <= code && code <= 0x0A48) || // Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI +// (0x0A4B <= code && code <= 0x0A4D) || // Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA +// 0x0A51 == code || // Mn GURMUKHI SIGN UDAAT +// (0x0A70 <= code && code <= 0x0A71) || // Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK +// 0x0A75 == code || // Mn GURMUKHI SIGN YAKASH +// (0x0A81 <= code && code <= 0x0A82) || // Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA +// 0x0ABC == code || // Mn GUJARATI SIGN NUKTA +// (0x0AC1 <= code && code <= 0x0AC5) || // Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E +// (0x0AC7 <= code && code <= 0x0AC8) || // Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI +// 0x0ACD == code || // Mn GUJARATI SIGN VIRAMA +// (0x0AE2 <= code && code <= 0x0AE3) || // Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL +// (0x0AFA <= code && code <= 0x0AFF) || // Mn [6] GUJARATI SIGN SUKUN..GUJARATI SIGN TWO-CIRCLE NUKTA ABOVE +// 0x0B01 == code || // Mn ORIYA SIGN CANDRABINDU +// 0x0B3C == code || // Mn ORIYA SIGN NUKTA +// 0x0B3E == code || // Mc ORIYA VOWEL SIGN AA +// 0x0B3F == code || // Mn ORIYA VOWEL SIGN I +// (0x0B41 <= code && code <= 0x0B44) || // Mn [4] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC RR +// 0x0B4D == code || // Mn ORIYA SIGN VIRAMA +// 0x0B56 == code || // Mn ORIYA AI LENGTH MARK +// 0x0B57 == code || // Mc ORIYA AU LENGTH MARK +// (0x0B62 <= code && code <= 0x0B63) || // Mn [2] ORIYA VOWEL SIGN VOCALIC L..ORIYA VOWEL SIGN VOCALIC LL +// 0x0B82 == code || // Mn TAMIL SIGN ANUSVARA +// 0x0BBE == code || // Mc TAMIL VOWEL SIGN AA +// 0x0BC0 == code || // Mn TAMIL VOWEL SIGN II +// 0x0BCD == code || // Mn TAMIL SIGN VIRAMA +// 0x0BD7 == code || // Mc TAMIL AU LENGTH MARK +// 0x0C00 == code || // Mn TELUGU SIGN COMBINING CANDRABINDU ABOVE +// (0x0C3E <= code && code <= 0x0C40) || // Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II +// 
(0x0C46 <= code && code <= 0x0C48) || // Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI +// (0x0C4A <= code && code <= 0x0C4D) || // Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA +// (0x0C55 <= code && code <= 0x0C56) || // Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK +// (0x0C62 <= code && code <= 0x0C63) || // Mn [2] TELUGU VOWEL SIGN VOCALIC L..TELUGU VOWEL SIGN VOCALIC LL +// 0x0C81 == code || // Mn KANNADA SIGN CANDRABINDU +// 0x0CBC == code || // Mn KANNADA SIGN NUKTA +// 0x0CBF == code || // Mn KANNADA VOWEL SIGN I +// 0x0CC2 == code || // Mc KANNADA VOWEL SIGN UU +// 0x0CC6 == code || // Mn KANNADA VOWEL SIGN E +// (0x0CCC <= code && code <= 0x0CCD) || // Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA +// (0x0CD5 <= code && code <= 0x0CD6) || // Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK +// (0x0CE2 <= code && code <= 0x0CE3) || // Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL +// (0x0D00 <= code && code <= 0x0D01) || // Mn [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU +// (0x0D3B <= code && code <= 0x0D3C) || // Mn [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA +// 0x0D3E == code || // Mc MALAYALAM VOWEL SIGN AA +// (0x0D41 <= code && code <= 0x0D44) || // Mn [4] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC RR +// 0x0D4D == code || // Mn MALAYALAM SIGN VIRAMA +// 0x0D57 == code || // Mc MALAYALAM AU LENGTH MARK +// (0x0D62 <= code && code <= 0x0D63) || // Mn [2] MALAYALAM VOWEL SIGN VOCALIC L..MALAYALAM VOWEL SIGN VOCALIC LL +// 0x0DCA == code || // Mn SINHALA SIGN AL-LAKUNA +// 0x0DCF == code || // Mc SINHALA VOWEL SIGN AELA-PILLA +// (0x0DD2 <= code && code <= 0x0DD4) || // Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA +// 0x0DD6 == code || // Mn SINHALA VOWEL SIGN DIGA PAA-PILLA +// 0x0DDF == code || // Mc SINHALA VOWEL SIGN GAYANUKITTA +// 0x0E31 == code || // Mn THAI CHARACTER MAI HAN-AKAT +// (0x0E34 <= code && code <= 
0x0E3A) || // Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU +// (0x0E47 <= code && code <= 0x0E4E) || // Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN +// 0x0EB1 == code || // Mn LAO VOWEL SIGN MAI KAN +// (0x0EB4 <= code && code <= 0x0EB9) || // Mn [6] LAO VOWEL SIGN I..LAO VOWEL SIGN UU +// (0x0EBB <= code && code <= 0x0EBC) || // Mn [2] LAO VOWEL SIGN MAI KON..LAO SEMIVOWEL SIGN LO +// (0x0EC8 <= code && code <= 0x0ECD) || // Mn [6] LAO TONE MAI EK..LAO NIGGAHITA +// (0x0F18 <= code && code <= 0x0F19) || // Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS +// 0x0F35 == code || // Mn TIBETAN MARK NGAS BZUNG NYI ZLA +// 0x0F37 == code || // Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS +// 0x0F39 == code || // Mn TIBETAN MARK TSA -PHRU +// (0x0F71 <= code && code <= 0x0F7E) || // Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO +// (0x0F80 <= code && code <= 0x0F84) || // Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA +// (0x0F86 <= code && code <= 0x0F87) || // Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS +// (0x0F8D <= code && code <= 0x0F97) || // Mn [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA +// (0x0F99 <= code && code <= 0x0FBC) || // Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA +// 0x0FC6 == code || // Mn TIBETAN SYMBOL PADMA GDAN +// (0x102D <= code && code <= 0x1030) || // Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU +// (0x1032 <= code && code <= 0x1037) || // Mn [6] MYANMAR VOWEL SIGN AI..MYANMAR SIGN DOT BELOW +// (0x1039 <= code && code <= 0x103A) || // Mn [2] MYANMAR SIGN VIRAMA..MYANMAR SIGN ASAT +// (0x103D <= code && code <= 0x103E) || // Mn [2] MYANMAR CONSONANT SIGN MEDIAL WA..MYANMAR CONSONANT SIGN MEDIAL HA +// (0x1058 <= code && code <= 0x1059) || // Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL +// (0x105E <= code && code <= 0x1060) || // Mn [3] MYANMAR CONSONANT SIGN MON MEDIAL 
NA..MYANMAR CONSONANT SIGN MON MEDIAL LA +// (0x1071 <= code && code <= 0x1074) || // Mn [4] MYANMAR VOWEL SIGN GEBA KAREN I..MYANMAR VOWEL SIGN KAYAH EE +// 0x1082 == code || // Mn MYANMAR CONSONANT SIGN SHAN MEDIAL WA +// (0x1085 <= code && code <= 0x1086) || // Mn [2] MYANMAR VOWEL SIGN SHAN E ABOVE..MYANMAR VOWEL SIGN SHAN FINAL Y +// 0x108D == code || // Mn MYANMAR SIGN SHAN COUNCIL EMPHATIC TONE +// 0x109D == code || // Mn MYANMAR VOWEL SIGN AITON AI +// (0x135D <= code && code <= 0x135F) || // Mn [3] ETHIOPIC COMBINING GEMINATION AND VOWEL LENGTH MARK..ETHIOPIC COMBINING GEMINATION MARK +// (0x1712 <= code && code <= 0x1714) || // Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA +// (0x1732 <= code && code <= 0x1734) || // Mn [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD +// (0x1752 <= code && code <= 0x1753) || // Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U +// (0x1772 <= code && code <= 0x1773) || // Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U +// (0x17B4 <= code && code <= 0x17B5) || // Mn [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA +// (0x17B7 <= code && code <= 0x17BD) || // Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA +// 0x17C6 == code || // Mn KHMER SIGN NIKAHIT +// (0x17C9 <= code && code <= 0x17D3) || // Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT +// 0x17DD == code || // Mn KHMER SIGN ATTHACAN +// (0x180B <= code && code <= 0x180D) || // Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE +// (0x1885 <= code && code <= 0x1886) || // Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA +// 0x18A9 == code || // Mn MONGOLIAN LETTER ALI GALI DAGALGA +// (0x1920 <= code && code <= 0x1922) || // Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U +// (0x1927 <= code && code <= 0x1928) || // Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O +// 0x1932 == code || // Mn LIMBU SMALL LETTER ANUSVARA +// (0x1939 <= code && code <= 0x193B) || // Mn [3] LIMBU SIGN 
MUKPHRENG..LIMBU SIGN SA-I +// (0x1A17 <= code && code <= 0x1A18) || // Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U +// 0x1A1B == code || // Mn BUGINESE VOWEL SIGN AE +// 0x1A56 == code || // Mn TAI THAM CONSONANT SIGN MEDIAL LA +// (0x1A58 <= code && code <= 0x1A5E) || // Mn [7] TAI THAM SIGN MAI KANG LAI..TAI THAM CONSONANT SIGN SA +// 0x1A60 == code || // Mn TAI THAM SIGN SAKOT +// 0x1A62 == code || // Mn TAI THAM VOWEL SIGN MAI SAT +// (0x1A65 <= code && code <= 0x1A6C) || // Mn [8] TAI THAM VOWEL SIGN I..TAI THAM VOWEL SIGN OA BELOW +// (0x1A73 <= code && code <= 0x1A7C) || // Mn [10] TAI THAM VOWEL SIGN OA ABOVE..TAI THAM SIGN KHUEN-LUE KARAN +// 0x1A7F == code || // Mn TAI THAM COMBINING CRYPTOGRAMMIC DOT +// (0x1AB0 <= code && code <= 0x1ABD) || // Mn [14] COMBINING DOUBLED CIRCUMFLEX ACCENT..COMBINING PARENTHESES BELOW +// 0x1ABE == code || // Me COMBINING PARENTHESES OVERLAY +// (0x1B00 <= code && code <= 0x1B03) || // Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG +// 0x1B34 == code || // Mn BALINESE SIGN REREKAN +// (0x1B36 <= code && code <= 0x1B3A) || // Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA +// 0x1B3C == code || // Mn BALINESE VOWEL SIGN LA LENGA +// 0x1B42 == code || // Mn BALINESE VOWEL SIGN PEPET +// (0x1B6B <= code && code <= 0x1B73) || // Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG +// (0x1B80 <= code && code <= 0x1B81) || // Mn [2] SUNDANESE SIGN PANYECEK..SUNDANESE SIGN PANGLAYAR +// (0x1BA2 <= code && code <= 0x1BA5) || // Mn [4] SUNDANESE CONSONANT SIGN PANYAKRA..SUNDANESE VOWEL SIGN PANYUKU +// (0x1BA8 <= code && code <= 0x1BA9) || // Mn [2] SUNDANESE VOWEL SIGN PAMEPET..SUNDANESE VOWEL SIGN PANEULEUNG +// (0x1BAB <= code && code <= 0x1BAD) || // Mn [3] SUNDANESE SIGN VIRAMA..SUNDANESE CONSONANT SIGN PASANGAN WA +// 0x1BE6 == code || // Mn BATAK SIGN TOMPI +// (0x1BE8 <= code && code <= 0x1BE9) || // Mn [2] BATAK VOWEL SIGN PAKPAK E..BATAK VOWEL SIGN EE +// 
0x1BED == code || // Mn BATAK VOWEL SIGN KARO O +// (0x1BEF <= code && code <= 0x1BF1) || // Mn [3] BATAK VOWEL SIGN U FOR SIMALUNGUN SA..BATAK CONSONANT SIGN H +// (0x1C2C <= code && code <= 0x1C33) || // Mn [8] LEPCHA VOWEL SIGN E..LEPCHA CONSONANT SIGN T +// (0x1C36 <= code && code <= 0x1C37) || // Mn [2] LEPCHA SIGN RAN..LEPCHA SIGN NUKTA +// (0x1CD0 <= code && code <= 0x1CD2) || // Mn [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA +// (0x1CD4 <= code && code <= 0x1CE0) || // Mn [13] VEDIC SIGN YAJURVEDIC MIDLINE SVARITA..VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA +// (0x1CE2 <= code && code <= 0x1CE8) || // Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL +// 0x1CED == code || // Mn VEDIC SIGN TIRYAK +// 0x1CF4 == code || // Mn VEDIC TONE CANDRA ABOVE +// (0x1CF8 <= code && code <= 0x1CF9) || // Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE +// (0x1DC0 <= code && code <= 0x1DF9) || // Mn [58] COMBINING DOTTED GRAVE ACCENT..COMBINING WIDE INVERTED BRIDGE BELOW +// (0x1DFB <= code && code <= 0x1DFF) || // Mn [5] COMBINING DELETION MARK..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW +// 0x200C == code || // Cf ZERO WIDTH NON-JOINER +// (0x20D0 <= code && code <= 0x20DC) || // Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE +// (0x20DD <= code && code <= 0x20E0) || // Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH +// 0x20E1 == code || // Mn COMBINING LEFT RIGHT ARROW ABOVE +// (0x20E2 <= code && code <= 0x20E4) || // Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE +// (0x20E5 <= code && code <= 0x20F0) || // Mn [12] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING ASTERISK ABOVE +// (0x2CEF <= code && code <= 0x2CF1) || // Mn [3] COPTIC COMBINING NI ABOVE..COPTIC COMBINING SPIRITUS LENIS +// 0x2D7F == code || // Mn TIFINAGH CONSONANT JOINER +// (0x2DE0 <= code && code <= 0x2DFF) || // Mn [32] COMBINING CYRILLIC LETTER BE..COMBINING CYRILLIC LETTER 
IOTIFIED BIG YUS +// (0x302A <= code && code <= 0x302D) || // Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK +// (0x302E <= code && code <= 0x302F) || // Mc [2] HANGUL SINGLE DOT TONE MARK..HANGUL DOUBLE DOT TONE MARK +// (0x3099 <= code && code <= 0x309A) || // Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK +// 0x_a66F == code || // Mn COMBINING CYRILLIC VZMET +// (0x_a670 <= code && code <= 0x_a672) || // Me [3] COMBINING CYRILLIC TEN MILLIONS SIGN..COMBINING CYRILLIC THOUSAND MILLIONS SIGN +// (0x_a674 <= code && code <= 0x_a67D) || // Mn [10] COMBINING CYRILLIC LETTER UKRAINIAN IE..COMBINING CYRILLIC PAYEROK +// (0x_a69E <= code && code <= 0x_a69F) || // Mn [2] COMBINING CYRILLIC LETTER EF..COMBINING CYRILLIC LETTER IOTIFIED E +// (0x_a6F0 <= code && code <= 0x_a6F1) || // Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM COMBINING MARK TUKWENTIS +// 0x_a802 == code || // Mn SYLOTI NAGRI SIGN DVISVARA +// 0x_a806 == code || // Mn SYLOTI NAGRI SIGN HASANTA +// 0x_a80B == code || // Mn SYLOTI NAGRI SIGN ANUSVARA +// (0x_a825 <= code && code <= 0x_a826) || // Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E +// (0x_a8C4 <= code && code <= 0x_a8C5) || // Mn [2] SAURASHTRA SIGN VIRAMA..SAURASHTRA SIGN CANDRABINDU +// (0x_a8E0 <= code && code <= 0x_a8F1) || // Mn [18] COMBINING DEVANAGARI DIGIT ZERO..COMBINING DEVANAGARI SIGN AVAGRAHA +// (0x_a926 <= code && code <= 0x_a92D) || // Mn [8] KAYAH LI VOWEL UE..KAYAH LI TONE CALYA PLOPHU +// (0x_a947 <= code && code <= 0x_a951) || // Mn [11] REJANG VOWEL SIGN I..REJANG CONSONANT SIGN R +// (0x_a980 <= code && code <= 0x_a982) || // Mn [3] JAVANESE SIGN PANYANGGA..JAVANESE SIGN LAYAR +// 0x_a9B3 == code || // Mn JAVANESE SIGN CECAK TELU +// (0x_a9B6 <= code && code <= 0x_a9B9) || // Mn [4] JAVANESE VOWEL SIGN WULU..JAVANESE VOWEL SIGN SUKU MENDUT +// 0x_a9BC == code || // Mn JAVANESE VOWEL SIGN PEPET +// 0x_a9E5 == code || // Mn MYANMAR SIGN 
SHAN SAW +// (0x_aA29 <= code && code <= 0x_aA2E) || // Mn [6] CHAM VOWEL SIGN AA..CHAM VOWEL SIGN OE +// (0x_aA31 <= code && code <= 0x_aA32) || // Mn [2] CHAM VOWEL SIGN AU..CHAM VOWEL SIGN UE +// (0x_aA35 <= code && code <= 0x_aA36) || // Mn [2] CHAM CONSONANT SIGN LA..CHAM CONSONANT SIGN WA +// 0x_aA43 == code || // Mn CHAM CONSONANT SIGN FINAL NG +// 0x_aA4C == code || // Mn CHAM CONSONANT SIGN FINAL M +// 0x_aA7C == code || // Mn MYANMAR SIGN TAI LAING TONE-2 +// 0x_aAB0 == code || // Mn TAI VIET MAI KANG +// (0x_aAB2 <= code && code <= 0x_aAB4) || // Mn [3] TAI VIET VOWEL I..TAI VIET VOWEL U +// (0x_aAB7 <= code && code <= 0x_aAB8) || // Mn [2] TAI VIET MAI KHIT..TAI VIET VOWEL IA +// (0x_aABE <= code && code <= 0x_aABF) || // Mn [2] TAI VIET VOWEL AM..TAI VIET TONE MAI EK +// 0x_aAC1 == code || // Mn TAI VIET TONE MAI THO +// (0x_aAEC <= code && code <= 0x_aAED) || // Mn [2] MEETEI MAYEK VOWEL SIGN UU..MEETEI MAYEK VOWEL SIGN AAI +// 0x_aAF6 == code || // Mn MEETEI MAYEK VIRAMA +// 0x_aBE5 == code || // Mn MEETEI MAYEK VOWEL SIGN ANAP +// 0x_aBE8 == code || // Mn MEETEI MAYEK VOWEL SIGN UNAP +// 0x_aBED == code || // Mn MEETEI MAYEK APUN IYEK +// 0x_fB1E == code || // Mn HEBREW POINT JUDEO-SPANISH VARIKA +// (0x_fE00 <= code && code <= 0x_fE0F) || // Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 +// (0x_fE20 <= code && code <= 0x_fE2F) || // Mn [16] COMBINING LIGATURE LEFT HALF..COMBINING CYRILLIC TITLO RIGHT HALF +// (0x_fF9E <= code && code <= 0x_fF9F) || // Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +// 0x101FD == code || // Mn PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE +// 0x102E0 == code || // Mn COPTIC EPACT THOUSANDS MARK +// (0x10376 <= code && code <= 0x1037A) || // Mn [5] COMBINING OLD PERMIC LETTER AN..COMBINING OLD PERMIC LETTER SII +// (0x10A01 <= code && code <= 0x10A03) || // Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R +// (0x10A05 <= code && code <= 0x10A06) || // Mn [2] 
KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O +// (0x10A0C <= code && code <= 0x10A0F) || // Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA +// (0x10A38 <= code && code <= 0x10A3A) || // Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW +// 0x10A3F == code || // Mn KHAROSHTHI VIRAMA +// (0x10AE5 <= code && code <= 0x10AE6) || // Mn [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW +// 0x11001 == code || // Mn BRAHMI SIGN ANUSVARA +// (0x11038 <= code && code <= 0x11046) || // Mn [15] BRAHMI VOWEL SIGN AA..BRAHMI VIRAMA +// (0x1107F <= code && code <= 0x11081) || // Mn [3] BRAHMI NUMBER JOINER..KAITHI SIGN ANUSVARA< |