diff options
author | Manos Pitsidianakis <el13635@mail.ntua.gr> | 2020-11-21 00:57:27 +0200 |
---|---|---|
committer | Manos Pitsidianakis <el13635@mail.ntua.gr> | 2020-11-21 02:09:18 +0200 |
commit | 1da6d75b0888d393885104409ce7350881e4ec53 (patch) | |
tree | fb8efd6dcb56d85784a2ef01a81a39cf5be36d30 /melib | |
parent | a7c0bca8cef29c9f59beec74fc9b1a6bdfe2b729 (diff) |
melib/text_processing: add new wcwidth implementation
Download and parse Unicode data files to judge code point width.
Inspired by https://github.com/ridiculousfish/widecharwidth/
Diffstat (limited to 'melib')
-rw-r--r-- | melib/build.rs | 318 | ||||
-rw-r--r-- | melib/src/text_processing/tables.rs | 1352 | ||||
-rw-r--r-- | melib/src/text_processing/wcwidth.rs | 549 |
3 files changed, 1692 insertions, 527 deletions
diff --git a/melib/build.rs b/melib/build.rs index 17570c0b..7fa5ad96 100644 --- a/melib/build.rs +++ b/melib/build.rs @@ -25,6 +25,9 @@ include!("src/text_processing/types.rs"); fn main() -> Result<(), std::io::Error> { #[cfg(feature = "unicode_algorithms")] { + const MOD_PATH: &str = "src/text_processing/tables.rs"; + println!("cargo:rerun-if-changed=build.rs"); + println!("cargo:rerun-if-changed={}", MOD_PATH); /* Line break tables */ use std::fs::File; use std::io::prelude::*; @@ -33,8 +36,15 @@ fn main() -> Result<(), std::io::Error> { use std::process::{Command, Stdio}; const LINE_BREAK_TABLE_URL: &str = "http://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt"; + /* Grapheme width tables */ + const UNICODE_DATA_URL: &str = + "http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"; + const EAW_URL: &str = "http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt"; + const EMOJI_DATA_URL: &str = + "https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt"; - let mod_path = Path::new("src/text_processing/tables.rs"); + + let mod_path = Path::new(MOD_PATH); if mod_path.exists() { eprintln!( "{} already exists, delete it if you want to replace it.", @@ -66,17 +76,255 @@ fn main() -> Result<(), std::io::Error> { let mut codepoint_iter = chars_str.split(".."); let first_codepoint: u32 = - u32::from_str_radix(std::dbg!(codepoint_iter.next().unwrap()), 16).unwrap(); + u32::from_str_radix(codepoint_iter.next().unwrap(), 16).unwrap(); let sec_codepoint: u32 = codepoint_iter .next() - .map(|v| u32::from_str_radix(std::dbg!(v), 16).unwrap()) + .map(|v| u32::from_str_radix(v, 16).unwrap()) .unwrap_or(first_codepoint); let class = &tokens[semicolon_idx + 1..semicolon_idx + 1 + 2]; line_break_table.push((first_codepoint, sec_codepoint, LineBreakClass::from(class))); } child.wait()?; + let child = Command::new("curl") + .args(&["-o", "-", UNICODE_DATA_URL]) + .stdout(Stdio::piped()) + .output()?; + + let unicode_data = 
String::from_utf8_lossy(&child.stdout); + + let child = Command::new("curl") + .args(&["-o", "-", EAW_URL]) + .stdout(Stdio::piped()) + .output()?; + + let eaw_data = String::from_utf8_lossy(&child.stdout); + + let child = Command::new("curl") + .args(&["-o", "-", EMOJI_DATA_URL]) + .stdout(Stdio::piped()) + .output()?; + + let emoji_data = String::from_utf8_lossy(&child.stdout); + + const MAX_CODEPOINT: usize = 0x110000; + // See https://www.unicode.org/L2/L1999/UnicodeData.html + const FIELD_CODEPOINT: usize = 0; + const FIELD_CATEGORY: usize = 2; + // Ambiguous East Asian characters + const WIDTH_AMBIGUOUS_EASTASIAN: isize = -3; + + // Width changed from 1 to 2 in Unicode 9.0 + const WIDTH_WIDENED_IN_9: isize = -6; + // Category for unassigned codepoints. + const CAT_UNASSIGNED: &str = "Cn"; + + // Category for private use codepoints. + const CAT_PRIVATE_USE: &str = "Co"; + + // Category for surrogates. + const CAT_SURROGATE: &str = "Cs"; + + struct Codepoint<'cat> { + raw: u32, + width: Option<isize>, + category: &'cat str, + } + + let mut codepoints: Vec<Codepoint> = Vec::with_capacity(MAX_CODEPOINT + 1); + for i in 0..=MAX_CODEPOINT { + codepoints.push(Codepoint { + raw: i as u32, + width: None, + category: CAT_UNASSIGNED, + }); + } + + set_general_categories(&mut codepoints, &unicode_data); + set_eaw_widths(&mut codepoints, &eaw_data); + set_emoji_widths(&mut codepoints, &emoji_data); + set_hardcoded_ranges(&mut codepoints); + fn hexrange_to_range(hexrange: &str) -> std::ops::Range<usize> { + /* Given a string like 1F300..1F320 representing an inclusive range, + return the range of codepoints. + If the string is like 1F321, return a range of just that element. 
+ */ + let hexrange = hexrange.trim(); + let fields = hexrange + .split("..") + .map(|h| usize::from_str_radix(h.trim(), 16).unwrap()) + .collect::<Vec<usize>>(); + if fields.len() == 1 { + fields[0]..(fields[0] + 1) + } else { + fields[0]..(fields[1] + 1) + } + } + + fn set_general_categories<'u>(codepoints: &mut Vec<Codepoint<'u>>, unicode_data: &'u str) { + for line in unicode_data.lines() { + let fields = line.trim().split(";").collect::<Vec<_>>(); + if fields.len() > FIELD_CATEGORY { + for idx in hexrange_to_range(fields[FIELD_CODEPOINT]) { + codepoints[idx].category = fields[FIELD_CATEGORY]; + } + } + } + } + + fn set_eaw_widths(codepoints: &mut Vec<Codepoint<'_>>, eaw_data_lines: &str) { + // Read from EastAsianWidth.txt, set width values on the codepoints + for line in eaw_data_lines.lines() { + let line = line.trim().split('#').next().unwrap_or(line); + let fields = line.trim().split(';').collect::<Vec<_>>(); + if fields.len() != 2 { + continue; + } + let hexrange = fields[0]; + let width_type = fields[1]; + // width_types: + // A: ambiguous, F: fullwidth, H: halfwidth, + // . 
N: neutral, Na: east-asian Narrow + let width: isize = if width_type == "A" { + WIDTH_AMBIGUOUS_EASTASIAN + } else if width_type == "F" || width_type == "W" { + 2 + } else { + 1 + }; + for cp in hexrange_to_range(hexrange) { + codepoints[cp].width = Some(width); + } + } + // Apply the following special cases: + // - The unassigned code points in the following blocks default to "W": + // CJK Unified Ideographs Extension A: U+3400..U+4DBF + // CJK Unified Ideographs: U+4E00..U+9FFF + // CJK Compatibility Ideographs: U+F900..U+FAFF + // - All undesignated code points in Planes 2 and 3, whether inside or + // outside of allocated blocks, default to "W": + // Plane 2: U+20000..U+2FFFD + // Plane 3: U+30000..U+3FFFD + const WIDE_RANGES: [(usize, usize); 5] = [ + (0x3400, 0x4DBF), + (0x4E00, 0x9FFF), + (0xF900, 0xFAFF), + (0x20000, 0x2FFFD), + (0x30000, 0x3FFFD), + ]; + for &wr in WIDE_RANGES.iter() { + for cp in wr.0..(wr.1 + 1) { + if codepoints[cp].width.is_none() { + codepoints[cp].width = Some(2); + } + } + } + } + fn set_emoji_widths(codepoints: &mut Vec<Codepoint<'_>>, emoji_data_lines: &str) { + // Read from emoji-data.txt, set codepoint widths + for line in emoji_data_lines.lines() { + if !line.contains("#") || line.trim().starts_with("#") { + continue; + } + let mut fields = line.trim().split('#').collect::<Vec<_>>(); + if fields.len() != 2 { + continue; + } + let comment = fields.pop().unwrap(); + let fields = fields.pop().unwrap(); + + let hexrange = fields.split(";").next().unwrap(); + + // In later versions of emoji-data.txt there are some "reserved" + // entries that have "NA" instead of a Unicode version number + // of first use; such entries are skipped here instead of + // crashing the build script + if comment.trim().starts_with("NA") { + continue; + } + + use std::str::FromStr; + let mut v = comment.trim().split_whitespace().next().unwrap(); + if v.starts_with("E") { + v = &v[1..]; + } + if v.as_bytes() + .get(0) + .map(|c| !c.is_ascii_digit()) + 
.unwrap_or(true) + { + continue; + } + let mut idx = 1; + while v + .as_bytes() + .get(idx) + .map(|c| c.is_ascii_digit()) + .unwrap_or(false) + { + idx += 1; + } + if v.as_bytes().get(idx).map(|&c| c != b'.').unwrap_or(true) { + continue; + } + idx += 1; + while v + .as_bytes() + .get(idx) + .map(|c| c.is_ascii_digit()) + .unwrap_or(false) + { + idx += 1; + } + v = &v[0..idx]; + + let version = f32::from_str(v).unwrap(); + for cp in hexrange_to_range(hexrange) { + // Don't consider values below 1F000 as emoji. These can only be made + // emoji through the variation selector which interacts terribly + // with wcwidth(). + if cp < 0x1F000 { + continue; + } + // Skip codepoints that are explicitly not wide. + // For example U+1F336 ("Hot Pepper") renders like any emoji but is + // marked as neutral in EAW so has width 1 for some reason. + //if codepoints[cp].width == Some(1) { + // continue; + //} + + // If this emoji was introduced before Unicode 9, then it was widened in 9. + codepoints[cp].width = if version >= 9.0 { + Some(2) + } else { + Some(WIDTH_WIDENED_IN_9) + }; + } + } + } + fn set_hardcoded_ranges(codepoints: &mut Vec<Codepoint<'_>>) { + // Mark private use and surrogate codepoints + // Private use can be determined awkwardly from UnicodeData.txt, + // but we just hard-code them. + // We do not treat "private use high surrogate" as private use + // so as to match wcwidth9(). 
+ const PRIVATE_RANGES: [(usize, usize); 3] = + [(0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD)]; + for &(first, last) in PRIVATE_RANGES.iter() { + for idx in first..=last { + codepoints[idx].category = CAT_PRIVATE_USE; + } + } + + const SURROGATE_RANGES: [(usize, usize); 2] = [(0xD800, 0xDBFF), (0xDC00, 0xDFFF)]; + for &(first, last) in SURROGATE_RANGES.iter() { + for idx in first..=last { + codepoints[idx].category = CAT_SURROGATE; + } + } + } + let mut file = File::create(&mod_path)?; file.write_all( br#"/* @@ -110,7 +358,69 @@ pub const LINE_BREAK_RULES: &[(u32, u32, LineBreakClass)] = &[ file.write_all(format!(" (0x{:X}, 0x{:X}, {:?}),\n", l.0, l.1, l.2).as_bytes()) .unwrap(); } - file.write_all(b"];").unwrap(); + file.write_all(b"];\n").unwrap(); + + for (name, filter) in [ + ( + "ASCII", + Box::new(|c: &&Codepoint| c.raw < 0x7f && c.raw >= 0x20) + as Box<dyn Fn(&&Codepoint) -> bool>, + ), + ( + "PRIVATE", + Box::new(|c: &&Codepoint| c.category == CAT_PRIVATE_USE), + ), + ( + "NONPRINT", + Box::new(|c: &&Codepoint| { + ["Cc", "Cf", "Zl", "Zp", CAT_SURROGATE].contains(&c.category) + }), + ), + ( + "COMBINING", + Box::new(|c: &&Codepoint| ["Mn", "Mc", "Me"].contains(&c.category)), + ), + ("DOUBLEWIDE", Box::new(|c: &&Codepoint| c.width == Some(2))), + ( + "UNASSIGNED", + Box::new(|c: &&Codepoint| c.category == CAT_UNASSIGNED), + ), + ( + "AMBIGUOUS", + Box::new(|c: &&Codepoint| c.width == Some(WIDTH_AMBIGUOUS_EASTASIAN)), + ), + ( + "WIDENEDIN9", + Box::new(|c: &&Codepoint| c.width == Some(WIDTH_WIDENED_IN_9)), + ), + ] + .iter() + { + file.write_all( + format!( + r#" +pub const {}: &[(u32, u32)] = &[ +"#, + name + ) + .as_bytes(), + ) + .unwrap(); + let mut iter = codepoints.iter().filter(filter); + let mut prev = iter.next().unwrap().raw; + let mut a = prev; + for cp in iter { + if prev + 1 != cp.raw { + file.write_all(format!(" (0x{:X}, 0x{:X}),\n", a, prev).as_bytes()) + .unwrap(); + a = cp.raw; + } + prev = cp.raw; + } + 
file.write_all(format!(" (0x{:X}, 0x{:X}),\n", a, prev).as_bytes()) + .unwrap(); + file.write_all(b"];\n").unwrap(); + } } Ok(()) } diff --git a/melib/src/text_processing/tables.rs b/melib/src/text_processing/tables.rs index ea5120c6..234074bb 100644 --- a/melib/src/text_processing/tables.rs +++ b/melib/src/text_processing/tables.rs @@ -3453,4 +3453,1354 @@ pub const LINE_BREAK_RULES: &[(u32, u32, LineBreakClass)] = &[ (0xE0100, 0xE01EF, CM), (0xF0000, 0xFFFFD, XX), (0x100000, 0x10FFFD, XX), -];
\ No newline at end of file +]; + +pub const ASCII: &[(u32, u32)] = &[ + (0x20, 0x7E), +]; + +pub const PRIVATE: &[(u32, u32)] = &[ + (0xE000, 0xF8FF), + (0xF0000, 0xFFFFD), + (0x100000, 0x10FFFD), +]; + +pub const NONPRINT: &[(u32, u32)] = &[ + (0x0, 0x1F), + (0x7F, 0x9F), + (0xAD, 0xAD), + (0x600, 0x605), + (0x61C, 0x61C), + (0x6DD, 0x6DD), + (0x70F, 0x70F), + (0x8E2, 0x8E2), + (0x180E, 0x180E), + (0x200B, 0x200F), + (0x2028, 0x202E), + (0x2060, 0x2064), + (0x2066, 0x206F), + (0xD800, 0xDFFF), + (0xFEFF, 0xFEFF), + (0xFFF9, 0xFFFB), + (0x110BD, 0x110BD), + (0x110CD, 0x110CD), + (0x13430, 0x13438), + (0x1BCA0, 0x1BCA3), + (0x1D173, 0x1D17A), + (0xE0001, 0xE0001), + (0xE0020, 0xE007F), +]; + +pub const COMBINING: &[(u32, u32)] = &[ + (0x300, 0x36F), + (0x483, 0x489), + (0x591, 0x5BD), + (0x5BF, 0x5BF), + (0x5C1, 0x5C2), + (0x5C4, 0x5C5), + (0x5C7, 0x5C7), + (0x610, 0x61A), + (0x64B, 0x65F), + (0x670, 0x670), + (0x6D6, 0x6DC), + (0x6DF, 0x6E4), + (0x6E7, 0x6E8), + (0x6EA, 0x6ED), + (0x711, 0x711), + (0x730, 0x74A), + (0x7A6, 0x7B0), + (0x7EB, 0x7F3), + (0x7FD, 0x7FD), + (0x816, 0x819), + (0x81B, 0x823), + (0x825, 0x827), + (0x829, 0x82D), + (0x859, 0x85B), + (0x8D3, 0x8E1), + (0x8E3, 0x903), + (0x93A, 0x93C), + (0x93E, 0x94F), + (0x951, 0x957), + (0x962, 0x963), + (0x981, 0x983), + (0x9BC, 0x9BC), + (0x9BE, 0x9C4), + (0x9C7, 0x9C8), + (0x9CB, 0x9CD), + (0x9D7, 0x9D7), + (0x9E2, 0x9E3), + (0x9FE, 0x9FE), + (0xA01, 0xA03), + (0xA3C, 0xA3C), + (0xA3E, 0xA42), + (0xA47, 0xA48), + (0xA4B, 0xA4D), + (0xA51, 0xA51), + (0xA70, 0xA71), + (0xA75, 0xA75), + (0xA81, 0xA83), + (0xABC, 0xABC), + (0xABE, 0xAC5), + (0xAC7, 0xAC9), + (0xACB, 0xACD), + (0xAE2, 0xAE3), + (0xAFA, 0xAFF), + (0xB01, 0xB03), + (0xB3C, 0xB3C), + (0xB3E, 0xB44), + (0xB47, 0xB48), + (0xB4B, 0xB4D), + (0xB55, 0xB57), + (0xB62, 0xB63), + (0xB82, 0xB82), + (0xBBE, 0xBC2), + (0xBC6, 0xBC8), + (0xBCA, 0xBCD), + (0xBD7, 0xBD7), + (0xC00, 0xC04), + (0xC3E, 0xC44), + (0xC46, 0xC48), + (0xC4A, 0xC4D), + (0xC55, 
0xC56), + (0xC62, 0xC63), + (0xC81, 0xC83), + (0xCBC, 0xCBC), + (0xCBE, 0xCC4), + (0xCC6, 0xCC8), + (0xCCA, 0xCCD), + (0xCD5, 0xCD6), + (0xCE2, 0xCE3), + (0xD00, 0xD03), + (0xD3B, 0xD3C), + (0xD3E, 0xD44), + (0xD46, 0xD48), + (0xD4A, 0xD4D), + (0xD57, 0xD57), + (0xD62, 0xD63), + (0xD81, 0xD83), + (0xDCA, 0xDCA), + (0xDCF, 0xDD4), + (0xDD6, 0xDD6), + (0xDD8, 0xDDF), + (0xDF2, 0xDF3), + (0xE31, 0xE31), + (0xE34, 0xE3A), + (0xE47, 0xE4E), + (0xEB1, 0xEB1), + (0xEB4, 0xEBC), + (0xEC8, 0xECD), + (0xF18, 0xF19), + (0xF35, 0xF35), + (0xF37, 0xF37), + (0xF39, 0xF39), + (0xF3E, 0xF3F), + (0xF71, 0xF84), + (0xF86, 0xF87), + (0xF8D, 0xF97), + (0xF99, 0xFBC), + (0xFC6, 0xFC6), + (0x102B, 0x103E), + (0x1056, 0x1059), + (0x105E, 0x1060), + (0x1062, 0x1064), + (0x1067, 0x106D), + (0x1071, 0x1074), + (0x1082, 0x108D), + (0x108F, 0x108F), + (0x109A, 0x109D), + (0x135D, 0x135F), + (0x1712, 0x1714), + (0x1732, 0x1734), + (0x1752, 0x1753), + (0x1772, 0x1773), + (0x17B4, 0x17D3), + (0x17DD, 0x17DD), + (0x180B, 0x180D), + (0x1885, 0x1886), + (0x18A9, 0x18A9), + (0x1920, 0x192B), + (0x1930, 0x193B), + (0x1A17, 0x1A1B), + (0x1A55, 0x1A5E), + (0x1A60, 0x1A7C), + (0x1A7F, 0x1A7F), + (0x1AB0, 0x1AC0), + (0x1B00, 0x1B04), + (0x1B34, 0x1B44), + (0x1B6B, 0x1B73), + (0x1B80, 0x1B82), + (0x1BA1, 0x1BAD), + (0x1BE6, 0x1BF3), + (0x1C24, 0x1C37), + (0x1CD0, 0x1CD2), + (0x1CD4, 0x1CE8), + (0x1CED, 0x1CED), + (0x1CF4, 0x1CF4), + (0x1CF7, 0x1CF9), + (0x1DC0, 0x1DF9), + (0x1DFB, 0x1DFF), + (0x20D0, 0x20F0), + (0x2CEF, 0x2CF1), + (0x2D7F, 0x2D7F), + (0x2DE0, 0x2DFF), + (0x302A, 0x302F), + (0x3099, 0x309A), + (0xA66F, 0xA672), + (0xA674, 0xA67D), + (0xA69E, 0xA69F), + (0xA6F0, 0xA6F1), + (0xA802, 0xA802), + (0xA806, 0xA806), + (0xA80B, 0xA80B), + (0xA823, 0xA827), + (0xA82C, 0xA82C), + (0xA880, 0xA881), + (0xA8B4, 0xA8C5), + (0xA8E0, 0xA8F1), + (0xA8FF, 0xA8FF), + (0xA926, 0xA92D), + (0xA947, 0xA953), + (0xA980, 0xA983), + (0xA9B3, 0xA9C0), + (0xA9E5, 0xA9E5), + (0xAA29, 0xAA36), + (0xAA43, 0xAA43), + 
(0xAA4C, 0xAA4D), + (0xAA7B, 0xAA7D), + (0xAAB0, 0xAAB0), + (0xAAB2, 0xAAB4), + (0xAAB7, 0xAAB8), + (0xAABE, 0xAABF), + (0xAAC1, 0xAAC1), + (0xAAEB, 0xAAEF), + (0xAAF5, 0xAAF6), + (0xABE3, 0xABEA), + (0xABEC, 0xABED), + (0xFB1E, 0xFB1E), + (0xFE00, 0xFE0F), + (0xFE20, 0xFE2F), + (0x101FD, 0x101FD), + (0x102E0, 0x102E0), + (0x10376, 0x1037A), + (0x10A01, 0x10A03), + (0x10A05, 0x10A06), + (0x10A0C, 0x10A0F), + (0x10A38, 0x10A3A), + (0x10A3F, 0x10A3F), + (0x10AE5, 0x10AE6), + (0x10D24, 0x10D27), + (0x10EAB, 0x10EAC), + (0x10F46, 0x10F50), + (0x11000, 0x11002), + (0x11038, 0x11046), + (0x1107F, 0x11082), + (0x110B0, 0x110BA), + (0x11100, 0x11102), + (0x11127, 0x11134), + (0x11145, 0x11146), + (0x11173, 0x11173), + (0x11180, 0x11182), + (0x111B3, 0x111C0), + (0x111C9, 0x111CC), + (0x111CE, 0x111CF), + (0x1122C, 0x11237), + (0x1123E, 0x1123E), + (0x112DF, 0x112EA), + (0x11300, 0x11303), + (0x1133B, 0x1133C), + (0x1133E, 0x11344), + (0x11347, 0x11348), + (0x1134B, 0x1134D), + (0x11357, 0x11357), + (0x11362, 0x11363), + (0x11366, 0x1136C), + (0x11370, 0x11374), + (0x11435, 0x11446), + (0x1145E, 0x1145E), + (0x114B0, 0x114C3), + (0x115AF, 0x115B5), + (0x115B8, 0x115C0), + (0x115DC, 0x115DD), + (0x11630, 0x11640), + (0x116AB, 0x116B7), + (0x1171D, 0x1172B), + (0x1182C, 0x1183A), + (0x11930, 0x11935), + (0x11937, 0x11938), + (0x1193B, 0x1193E), + (0x11940, 0x11940), + (0x11942, 0x11943), + (0x119D1, 0x119D7), + (0x119DA, 0x119E0), + (0x119E4, 0x119E4), + (0x11A01, 0x11A0A), + (0x11A33, 0x11A39), + (0x11A3B, 0x11A3E), + (0x11A47, 0x11A47), + (0x11A51, 0x11A5B), + (0x11A8A, 0x11A99), + (0x11C2F, 0x11C36), + (0x11C38, 0x11C3F), + (0x11C92, 0x11CA7), + (0x11CA9, 0x11CB6), + (0x11D31, 0x11D36), + (0x11D3A, 0x11D3A), + (0x11D3C, 0x11D3D), + (0x11D3F, 0x11D45), + (0x11D47, 0x11D47), + (0x11D8A, 0x11D8E), + (0x11D90, 0x11D91), + (0x11D93, 0x11D97), + (0x11EF3, 0x11EF6), + (0x16AF0, 0x16AF4), + (0x16B30, 0x16B36), + (0x16F4F, 0x16F4F), + (0x16F51, 0x16F87), + (0x16F8F, 0x16F92), + 
(0x16FE4, 0x16FE4), + (0x16FF0, 0x16FF1), + (0x1BC9D, 0x1BC9E), + (0x1D165, 0x1D169), + (0x1D16D, 0x1D172), + (0x1D17B, 0x1D182), + (0x1D185, 0x1D18B), + (0x1D1AA, 0x1D1AD), + (0x1D242, 0x1D244), + (0x1DA00, 0x1DA36), + (0x1DA3B, 0x1DA6C), + (0x1DA75, 0x1DA75), + (0x1DA84, 0x1DA84), + (0x1DA9B, 0x1DA9F), + (0x1DAA1, 0x1DAAF), + (0x1E000, 0x1E006), + (0x1E008, 0x1E018), + (0x1E01B, 0x1E021), + (0x1E023, 0x1E024), + (0x1E026, 0x1E02A), + (0x1E130, 0x1E136), + (0x1E2EC, 0x1E2EF), + (0x1E8D0, 0x1E8D6), + (0x1E944, 0x1E94A), + (0xE0100, 0xE01EF), +]; + +pub const DOUBLEWIDE: &[(u32, u32)] = &[ + (0x1100, 0x115F), + (0x231A, 0x231B), + (0x2329, 0x232A), + (0x23E9, 0x23EC), + (0x23F0, 0x23F0), + (0x23F3, 0x23F3), + (0x25FD, 0x25FE), + (0x2614, 0x2615), + (0x2648, 0x2653), + (0x267F, 0x267F), + (0x2693, 0x2693), + (0x26A1, 0x26A1), + (0x26AA, 0x26AB), + (0x26BD, 0x26BE), + (0x26C4, 0x26C5), + (0x26CE, 0x26CE), + (0x26D4, 0x26D4), + (0x26EA, 0x26EA), + (0x26F2, 0x26F3), + (0x26F5, 0x26F5), + (0x26FA, 0x26FA), + (0x26FD, 0x26FD), + (0x2705, 0x2705), + (0x270A, 0x270B), + (0x2728, 0x2728), + (0x274C, 0x274C), + (0x274E, 0x274E), + (0x2753, 0x2755), + (0x2757, 0x2757), + (0x2795, 0x2797), + (0x27B0, 0x27B0), + (0x27BF, 0x27BF), + (0x2B1B, 0x2B1C), + (0x2B50, 0x2B50), + (0x2B55, 0x2B55), + (0x2E80, 0x2E99), + (0x2E9B, 0x2EF3), + (0x2F00, 0x2FD5), + (0x2FF0, 0x2FFB), + (0x3000, 0x303E), + (0x3041, 0x3096), + (0x3099, 0x30FF), + (0x3105, 0x312F), + (0x3131, 0x318E), + (0x3190, 0x31E3), + (0x31F0, 0x321E), + (0x3220, 0x3247), + (0x3250, 0x4DBF), + (0x4E00, 0xA48C), + (0xA490, 0xA4C6), + (0xA960, 0xA97C), + (0xAC00, 0xD7A3), + (0xF900, 0xFAFF), + (0xFE10, 0xFE19), + (0xFE30, 0xFE52), + (0xFE54, 0xFE66), + (0xFE68, 0xFE6B), + (0xFF01, 0xFF60), + (0xFFE0, 0xFFE6), + (0x16FE0, 0x16FE4), + (0x16FF0, 0x16FF1), + (0x17000, 0x187F7), + (0x18800, 0x18CD5), + (0x18D00, 0x18D08), + (0x1B000, 0x1B11E), + (0x1B150, 0x1B152), + (0x1B164, 0x1B167), + (0x1B170, 0x1B2FB), + (0x1F200, 0x1F200), + 
(0x1F210, 0x1F219), + (0x1F21B, 0x1F22E), + (0x1F230, 0x1F231), + (0x1F23B, 0x1F23B), + (0x1F240, 0x1F248), + (0x1F6D5, 0x1F6D7), + (0x1F6F9, 0x1F6FC), + (0x1F7E0, 0x1F7EB), + (0x1F90C, 0x1F90F), + (0x1F93F, 0x1F93F), + (0x1F94D, 0x1F94F), + (0x1F96C, 0x1F978), + (0x1F97A, 0x1F97F), + (0x1F998, 0x1F9BF), + (0x1F9C1, 0x1F9CB), + (0x1F9CD, 0x1F9CF), + (0x1F9E7, 0x1F9FF), + (0x1FA70, 0x1FA74), + (0x1FA78, 0x1FA7A), + (0x1FA80, 0x1FA86), + (0x1FA90, 0x1FAA8), + (0x1FAB0, 0x1FAB6), + (0x1FAC0, 0x1FAC2), + (0x1FAD0, 0x1FAD6), + (0x20000, 0x2FFFD), + (0x30000, 0x3FFFD), +]; + +pub const UNASSIGNED: &[(u32, u32)] = &[ + (0x378, 0x379), + (0x380, 0x383), + (0x38B, 0x38B), + (0x38D, 0x38D), + (0x3A2, 0x3A2), + (0x530, 0x530), + (0x557, 0x558), + (0x58B, 0x58C), + (0x590, 0x590), + (0x5C8, 0x5CF), + (0x5EB, 0x5EE), + (0x5F5, 0x5FF), + (0x61D, 0x61D), + (0x70E, 0x70E), + (0x74B, 0x74C), + (0x7B2, 0x7BF), + (0x7FB, 0x7FC), + (0x82E, 0x82F), + (0x83F, 0x83F), + (0x85C, 0x85D), + (0x85F, 0x85F), + (0x86B, 0x89F), + (0x8B5, 0x8B5), + (0x8C8, 0x8D2), + (0x984, 0x984), + (0x98D, 0x98E), + (0x991, 0x992), + (0x9A9, 0x9A9), + (0x9B1, 0x9B1), + (0x9B3, 0x9B5), + (0x9BA, 0x9BB), + (0x9C5, 0x9C6), + (0x9C9, 0x9CA), + (0x9CF, 0x9D6), + (0x9D8, 0x9DB), + (0x9DE, 0x9DE), + (0x9E4, 0x9E5), + (0x9FF, 0xA00), + (0xA04, 0xA04), + (0xA0B, 0xA0E), + (0xA11, 0xA12), + (0xA29, 0xA29), + (0xA31, 0xA31), + (0xA34, 0xA34), + (0xA37, 0xA37), + (0xA3A, 0xA3B), + (0xA3D, 0xA3D), + (0xA43, 0xA46), + (0xA49, 0xA4A), + (0xA4E, 0xA50), + (0xA52, 0xA58), + (0xA5D, 0xA5D), + (0xA5F, 0xA65), + (0xA77, 0xA80), + (0xA84, 0xA84), + (0xA8E, 0xA8E), + (0xA92, 0xA92), + (0xAA9, 0xAA9), + (0xAB1, 0xAB1), + (0xAB4, 0xAB4), + (0xABA, 0xABB), + (0xAC6, 0xAC6), + (0xACA, 0xACA), + (0xACE, 0xACF), + (0xAD1, 0xADF), + (0xAE4, 0xAE5), + (0xAF2, 0xAF8), + (0xB00, 0xB00), + (0xB04, 0xB04), + (0xB0D, 0xB0E), + (0xB11, 0xB12), + (0xB29, 0xB29), + (0xB31, 0xB31), + (0xB34, 0xB34), + (0xB3A, 0xB3B), + (0xB45, 0xB46), + (0xB49, 
0xB4A), + (0xB4E, 0xB54), + (0xB58, 0xB5B), + (0xB5E, 0xB5E), + (0xB64, 0xB65), + (0xB78, 0xB81), + (0xB84, 0xB84), + (0xB8B, 0xB8D), + (0xB91, 0xB91), + (0xB96, 0xB98), + (0xB9B, 0xB9B), + (0xB9D, 0xB9D), + (0xBA0, 0xBA2), + (0xBA5, 0xBA7), + (0xBAB, 0xBAD), + (0xBBA, 0xBBD), + (0xBC3, 0xBC5), + (0xBC9, 0xBC9), + (0xBCE, 0xBCF), + (0xBD1, 0xBD6), + (0xBD8, 0xBE5), + (0xBFB, 0xBFF), + (0xC0D, 0xC0D), + (0xC11, 0xC11), + (0xC29, 0xC29), + (0xC3A, 0xC3C), + (0xC45, 0xC45), + (0xC49, 0xC49), + (0xC4E, 0xC54), + (0xC57, 0xC57), + (0xC5B, 0xC5F), + (0xC64, 0xC65), + (0xC70, 0xC76), + (0xC8D, 0xC8D), + (0xC91, 0xC91), + (0xCA9, 0xCA9), + (0xCB4, 0xCB4), + (0xCBA, 0xCBB), + (0xCC5, 0xCC5), + (0xCC9, 0xCC9), + (0xCCE, 0xCD4), + (0xCD7, 0xCDD), + (0xCDF, 0xCDF), + (0xCE4, 0xCE5), + (0xCF0, 0xCF0), + (0xCF3, 0xCFF), + (0xD0D, 0xD0D), + (0xD11, 0xD11), + (0xD45, 0xD45), + (0xD49, 0xD49), + (0xD50, 0xD53), + (0xD64, 0xD65), + (0xD80, 0xD80), + (0xD84, 0xD84), + (0xD97, 0xD99), + (0xDB2, 0xDB2), + (0xDBC, 0xDBC), + (0xDBE, 0xDBF), + (0xDC7, 0xDC9), + (0xDCB, 0xDCE), + (0xDD5, 0xDD5), + (0xDD7, 0xDD7), + (0xDE0, 0xDE5), + (0xDF0, 0xDF1), + (0xDF5, 0xE00), + (0xE3B, 0xE3E), + (0xE5C, 0xE80), + (0xE83, 0xE83), + (0xE85, 0xE85), + (0xE8B, 0xE8B), + (0xEA4, 0xEA4), + (0xEA6, 0xEA6), + (0xEBE, 0xEBF), + (0xEC5, 0xEC5), + (0xEC7, 0xEC7), + (0xECE, 0xECF), + (0xEDA, 0xEDB), + (0xEE0, 0xEFF), + (0xF48, 0xF48), + (0xF6D, 0xF70), + (0xF98, 0xF98), + (0xFBD, 0xFBD), + (0xFCD, 0xFCD), + (0xFDB, 0xFFF), + (0x10C6, 0x10C6), + (0x10C8, 0x10CC), + (0x10CE, 0x10CF), + (0x1249, 0x1249), + (0x124E, 0x124F), + (0x1257, 0x1257), + (0x1259, 0x1259), + (0x125E, 0x125F), + (0x1289, 0x1289), + (0x128E, 0x128F), + (0x12B1, 0x12B1), + (0x12B6, 0x12B7), + (0x12BF, 0x12BF), + (0x12C1, 0x12C1), + (0x12C6, 0x12C7), + (0x12D7, 0x12D7), + (0x1311, 0x1311), + (0x1316, 0x1317), + (0x135B, 0x135C), + (0x137D, 0x137F), + (0x139A, 0x139F), + (0x13F6, 0x13F7), + (0x13FE, 0x13FF), + (0x169D, 0x169F), + (0x16F9, 
0x16FF), + (0x170D, 0x170D), + (0x1715, 0x171F), + (0x1737, 0x173F), + (0x1754, 0x175F), + (0x176D, 0x176D), + (0x1771, 0x1771), + (0x1774, 0x177F), + (0x17DE, 0x17DF), + (0x17EA, 0x17EF), + (0x17FA, 0x17FF), + (0x180F, 0x180F), + (0x181A, 0x181F), + (0x1879, 0x187F), + (0x18AB, 0x18AF), + (0x18F6, 0x18FF), + (0x191F, 0x191F), + (0x192C, 0x192F), + (0x193C, 0x193F), + (0x1941, 0x1943), + (0x196E, 0x196F), + (0x1975, 0x197F), + (0x19AC, 0x19AF), + (0x19CA, 0x19CF), + (0x19DB, 0x19DD), + (0x1A1C, 0x1A1D), + (0x1A5F, 0x1A5F), + (0x1A7D, 0x1A7E), + (0x1A8A, 0x1A8F), + (0x1A9A, 0x1A9F), + (0x1AAE, 0x1AAF), + (0x1AC1, 0x1AFF), + (0x1B4C, 0x1B4F), + (0x1B7D, 0x1B7F), + (0x1BF4, 0x1BFB), + (0x1C38, 0x1C3A), + (0x1C4A, 0x1C4C), + (0x1C89, 0x1C8F), + (0x1CBB, 0x1CBC), + (0x1CC8, 0x1CCF), + (0x1CFB, 0x1CFF), + (0x1DFA, 0x1DFA), + (0x1F16, 0x1F17), + (0x1F1E, 0x1F1F), + (0x1F46, 0x1F47), + (0x1F4E, 0x1F4F), + (0x1F58, 0x1F58), + (0x1F5A, 0x1F5A), + (0x1F5C, 0x1F5C), + (0x1F5E, 0x1F5E), + (0x1F7E, 0x1F7F), + (0x1FB5, 0x1FB5), + (0x1FC5, 0x1FC5), + (0x1FD4, 0x1FD5), + (0x1FDC, 0x1FDC), + (0x1FF0, 0x1FF1), + (0x1FF5, 0x1FF5), + (0x1FFF, 0x1FFF), + (0x2065, 0x2065), + (0x2072, 0x2073), + (0x208F, 0x208F), + (0x209D, 0x209F), + (0x20C0, 0x20CF), + (0x20F1, 0x20FF), + (0x218C, 0x218F), + (0x2427, 0x243F), + (0x244B, 0x245F), + (0x2B74, 0x2B75), + (0x2B96, 0x2B96), + (0x2C2F, 0x2C2F), + (0x2C5F, 0x2C5F), + (0x2CF4, 0x2CF8), + (0x2D26, 0x2D26), + (0x2D28, 0x2D2C), + (0x2D2E, 0x2D2F), + (0x2D68, 0x2D6E), + (0x2D71, 0x2D7E), + (0x2D97, 0x2D9F), + (0x2DA7, 0x2DA7), + (0x2DAF, 0x2DAF), + (0x2DB7, 0x2DB7), + (0x2DBF, 0x2DBF), + (0x2DC7, 0x2DC7), + (0x2DCF, 0x2DCF), + (0x2DD7, 0x2DD7), + (0x2DDF, 0x2DDF), + (0x2E53, 0x2E7F), + (0x2E9A, 0x2E9A), + (0x2EF4, 0x2EFF), + (0x2FD6, 0x2FEF), + (0x2FFC, 0x2FFF), + (0x3040, 0x3040), + (0x3097, 0x3098), + (0x3100, 0x3104), + (0x3130, 0x3130), + (0x318F, 0x318F), + (0x31E4, 0x31EF), + (0x321F, 0x321F), + (0x3401, 0x4DBE), + (0x4E01, 0x9FFB), + (0x9FFD, 
0x9FFF), + (0xA48D, 0xA48F), + (0xA4C7, 0xA4CF), + (0xA62C, 0xA63F), + (0xA6F8, 0xA6FF), + (0xA7C0, 0xA7C1), + (0xA7CB, 0xA7F4), + (0xA82D, 0xA82F), + (0xA83A, 0xA83F), + (0xA878, 0xA87F), + (0xA8C6, 0xA8CD), + (0xA8DA, 0xA8DF), + (0xA954, 0xA95E), + (0xA97D, 0xA97F), + (0xA9CE, 0xA9CE), + (0xA9DA, 0xA9DD), + (0xA9FF, 0xA9FF), + (0xAA37, 0xAA3F), + (0xAA4E, 0xAA4F), + (0xAA5A, 0xAA5B), + (0xAAC3, 0xAADA), + (0xAAF7, 0xAB00), + (0xAB07, 0xAB08), + (0xAB0F, 0xAB10), + (0xAB17, 0xAB1F), + (0xAB27, 0xAB27), + (0xAB2F, 0xAB2F), + (0xAB6C, 0xAB6F), + (0xABEE, 0xABEF), + (0xABFA, 0xABFF), + (0xAC01, 0xD7A2), + (0xD7A4, 0xD7AF), + (0xD7C7, 0xD7CA), + (0xD7FC, 0xD7FF), + (0xFA6E, 0xFA6F), + (0xFADA, 0xFAFF), + (0xFB07, 0xFB12), + (0xFB18, 0xFB1C), + (0xFB37, 0xFB37), + (0xFB3D, 0xFB3D), + (0xFB3F, 0xFB3F), + (0xFB42, 0xFB42), + (0xFB45, 0xFB45), + (0xFBC2, 0xFBD2), + (0xFD40, 0xFD4F), + (0xFD90, 0xFD91), + (0xFDC8, 0xFDEF), + (0xFDFE, 0xFDFF), + (0xFE1A, 0xFE1F), + (0xFE53, 0xFE53), + (0xFE67, 0xFE67), + (0xFE6C, 0xFE6F), + (0xFE75, 0xFE75), + (0xFEFD, 0xFEFE), + (0xFF00, 0xFF00), + (0xFFBF, 0xFFC1), + (0xFFC8, 0xFFC9), + (0xFFD0, 0xFFD1), + (0xFFD8, 0xFFD9), + (0xFFDD, 0xFFDF), |