summaryrefslogtreecommitdiffstats
path: root/melib
diff options
context:
space:
mode:
authorManos Pitsidianakis <el13635@mail.ntua.gr>2020-11-21 00:57:27 +0200
committerManos Pitsidianakis <el13635@mail.ntua.gr>2020-11-21 02:09:18 +0200
commit1da6d75b0888d393885104409ce7350881e4ec53 (patch)
treefb8efd6dcb56d85784a2ef01a81a39cf5be36d30 /melib
parenta7c0bca8cef29c9f59beec74fc9b1a6bdfe2b729 (diff)
melib/text_processing: add new wcwidth implementation
Download and parse Unicode data files to judge code point width. Inspired by https://github.com/ridiculousfish/widecharwidth/
Diffstat (limited to 'melib')
-rw-r--r--melib/build.rs318
-rw-r--r--melib/src/text_processing/tables.rs1352
-rw-r--r--melib/src/text_processing/wcwidth.rs549
3 files changed, 1692 insertions, 527 deletions
diff --git a/melib/build.rs b/melib/build.rs
index 17570c0b..7fa5ad96 100644
--- a/melib/build.rs
+++ b/melib/build.rs
@@ -25,6 +25,9 @@ include!("src/text_processing/types.rs");
fn main() -> Result<(), std::io::Error> {
#[cfg(feature = "unicode_algorithms")]
{
+ const MOD_PATH: &str = "src/text_processing/tables.rs";
+ println!("cargo:rerun-if-changed=build.rs");
+ println!("cargo:rerun-if-changed={}", MOD_PATH);
/* Line break tables */
use std::fs::File;
use std::io::prelude::*;
@@ -33,8 +36,15 @@ fn main() -> Result<(), std::io::Error> {
use std::process::{Command, Stdio};
const LINE_BREAK_TABLE_URL: &str =
"http://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt";
+ /* Grapheme width tables */
+ const UNICODE_DATA_URL: &str =
+ "http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt";
+ const EAW_URL: &str = "http://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt";
+ const EMOJI_DATA_URL: &str =
+ "https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt";
- let mod_path = Path::new("src/text_processing/tables.rs");
+
+ let mod_path = Path::new(MOD_PATH);
if mod_path.exists() {
eprintln!(
"{} already exists, delete it if you want to replace it.",
@@ -66,17 +76,255 @@ fn main() -> Result<(), std::io::Error> {
let mut codepoint_iter = chars_str.split("..");
let first_codepoint: u32 =
- u32::from_str_radix(std::dbg!(codepoint_iter.next().unwrap()), 16).unwrap();
+ u32::from_str_radix(codepoint_iter.next().unwrap(), 16).unwrap();
let sec_codepoint: u32 = codepoint_iter
.next()
- .map(|v| u32::from_str_radix(std::dbg!(v), 16).unwrap())
+ .map(|v| u32::from_str_radix(v, 16).unwrap())
.unwrap_or(first_codepoint);
let class = &tokens[semicolon_idx + 1..semicolon_idx + 1 + 2];
line_break_table.push((first_codepoint, sec_codepoint, LineBreakClass::from(class)));
}
child.wait()?;
+ // Download one Unicode data file with curl and return its contents as text.
+ //
+ // `--fail` makes curl exit unsuccessfully on HTTP errors instead of handing
+ // back the server's error page, and the exit status is checked so that a
+ // failed download aborts the build script rather than silently producing
+ // empty or bogus tables. Invalid UTF-8 is replaced, as before.
+ let fetch = |url: &str| -> Result<String, std::io::Error> {
+     let output = Command::new("curl")
+         .args(&["--fail", "-o", "-", url])
+         .output()?;
+     if !output.status.success() {
+         return Err(std::io::Error::new(
+             std::io::ErrorKind::Other,
+             format!("curl failed to download {}: {}", url, output.status),
+         ));
+     }
+     Ok(String::from_utf8_lossy(&output.stdout).into_owned())
+ };
+
+ let unicode_data = fetch(UNICODE_DATA_URL)?;
+ let eaw_data = fetch(EAW_URL)?;
+ let emoji_data = fetch(EMOJI_DATA_URL)?;
+
+ const MAX_CODEPOINT: usize = 0x110000;
+ // See https://www.unicode.org/L2/L1999/UnicodeData.html
+ const FIELD_CODEPOINT: usize = 0;
+ const FIELD_CATEGORY: usize = 2;
+ // Ambiguous East Asian characters
+ const WIDTH_AMBIGUOUS_EASTASIAN: isize = -3;
+
+ // Width changed from 1 to 2 in Unicode 9.0
+ const WIDTH_WIDENED_IN_9: isize = -6;
+ // Category for unassigned codepoints.
+ const CAT_UNASSIGNED: &str = "Cn";
+
+ // Category for private use codepoints.
+ const CAT_PRIVATE_USE: &str = "Co";
+
+ // Category for surrogates.
+ const CAT_SURROGATE: &str = "Cs";
+
+ struct Codepoint<'cat> { // Per-code-point record filled in by the set_* passes below.
+ raw: u32, // The code point value itself.
+ width: Option<isize>, // None until a pass assigns a width; may hold a negative sentinel (WIDTH_AMBIGUOUS_EASTASIAN, WIDTH_WIDENED_IN_9).
+ category: &'cat str, // General category abbreviation such as "Cn"; initialised to CAT_UNASSIGNED.
+ }
+
+ let mut codepoints: Vec<Codepoint> = Vec::with_capacity(MAX_CODEPOINT + 1);
+ for i in 0..=MAX_CODEPOINT {
+ codepoints.push(Codepoint {
+ raw: i as u32,
+ width: None,
+ category: CAT_UNASSIGNED,
+ });
+ }
+
+ set_general_categories(&mut codepoints, &unicode_data);
+ set_eaw_widths(&mut codepoints, &eaw_data);
+ set_emoji_widths(&mut codepoints, &emoji_data);
+ set_hardcoded_ranges(&mut codepoints);
+ fn hexrange_to_range(hexrange: &str) -> std::ops::Range<usize> {
+     // Convert a hexadecimal code point spec into a half-open `Range`.
+     //
+     // `1F300..1F320` denotes an inclusive span, so the returned range ends
+     // one past the second bound. A lone value such as `1F321` yields a
+     // range containing just that code point.
+     // Panics if any bound is not valid hexadecimal.
+     let bounds: Vec<usize> = hexrange
+         .trim()
+         .split("..")
+         .map(|bound| usize::from_str_radix(bound.trim(), 16).unwrap())
+         .collect();
+     let first = bounds[0];
+     let last = match bounds.len() {
+         1 => first,
+         _ => bounds[1],
+     };
+     first..(last + 1)
+ }
+
+ fn set_general_categories<'u>(codepoints: &mut Vec<Codepoint<'u>>, unicode_data: &'u str) {
+     // Read UnicodeData.txt and store each code point's general category.
+     //
+     // Most lines describe a single code point. Large blocks are instead
+     // written as two consecutive lines whose name fields end in `, First>`
+     // and `, Last>`; every code point between the two shares the category.
+     // Lines with too few fields are ignored.
+     const FIELD_NAME: usize = 1;
+     // Start of a range opened by a `, First>` line on the previous line.
+     let mut range_start: Option<usize> = None;
+     for line in unicode_data.lines() {
+         let fields = line.trim().split(';').collect::<Vec<_>>();
+         if fields.len() <= FIELD_CATEGORY {
+             continue;
+         }
+         let category = fields[FIELD_CATEGORY];
+         let name = fields[FIELD_NAME];
+         let range = hexrange_to_range(fields[FIELD_CODEPOINT]);
+         // A pending start is only honoured by the line directly after it.
+         let pending = range_start.take();
+         if name.ends_with(", First>") {
+             range_start = Some(range.start);
+         }
+         let start = if name.ends_with(", Last>") {
+             pending.unwrap_or(range.start)
+         } else {
+             range.start
+         };
+         for idx in start..range.end {
+             codepoints[idx].category = category;
+         }
+     }
+ }
+
+ fn set_eaw_widths(codepoints: &mut Vec<Codepoint<'_>>, eaw_data_lines: &str) {
+     // Read EastAsianWidth.txt and record a width for every code point it lists.
+     //
+     // Data lines look like `<hexrange>;<class> # comment`, where class is
+     //   A: ambiguous, F: fullwidth, H: halfwidth, N: neutral,
+     //   Na: narrow, W: wide.
+     // Ambiguous maps to the WIDTH_AMBIGUOUS_EASTASIAN sentinel, F and W map
+     // to 2 and every other class maps to 1.
+     for line in eaw_data_lines.lines() {
+         // Drop the trailing `# ...` comment, if any.
+         let data = line.split('#').next().unwrap_or(line);
+         // Trim each field so the class still matches when the file pads
+         // around the `;` separator.
+         let fields = data.split(';').map(str::trim).collect::<Vec<_>>();
+         if fields.len() != 2 {
+             continue;
+         }
+         let width: isize = match fields[1] {
+             "A" => WIDTH_AMBIGUOUS_EASTASIAN,
+             "F" | "W" => 2,
+             _ => 1,
+         };
+         for cp in hexrange_to_range(fields[0]) {
+             codepoints[cp].width = Some(width);
+         }
+     }
+     // Apply the following special cases:
+     // - The unassigned code points in the following blocks default to "W":
+     //     CJK Unified Ideographs Extension A: U+3400..U+4DBF
+     //     CJK Unified Ideographs: U+4E00..U+9FFF
+     //     CJK Compatibility Ideographs: U+F900..U+FAFF
+     // - All undesignated code points in Planes 2 and 3, whether inside or
+     //   outside of allocated blocks, default to "W":
+     //     Plane 2: U+20000..U+2FFFD
+     //     Plane 3: U+30000..U+3FFFD
+     const WIDE_RANGES: [(usize, usize); 5] = [
+         (0x3400, 0x4DBF),
+         (0x4E00, 0x9FFF),
+         (0xF900, 0xFAFF),
+         (0x20000, 0x2FFFD),
+         (0x30000, 0x3FFFD),
+     ];
+     for &(first, last) in WIDE_RANGES.iter() {
+         for cp in first..=last {
+             // Only fill in code points the data file left without a width.
+             if codepoints[cp].width.is_none() {
+                 codepoints[cp].width = Some(2);
+             }
+         }
+     }
+ }
+ fn set_emoji_widths(codepoints: &mut Vec<Codepoint<'_>>, emoji_data_lines: &str) {
+     // Read emoji-data.txt and set the width of the emoji code points it lists.
+     //
+     // Only lines of the form `<hexrange> ; <property> # <version> ...` with
+     // exactly one `#` are used. The first token of the comment must start
+     // with `<digits>.<digits>`, optionally prefixed by `E`; lines without
+     // such a version (for example the "NA" reserved entries, or an empty
+     // comment) are skipped.
+
+     // Parse the leading `<digits>.<digits*>` of `token`, ignoring one
+     // optional `E` prefix. Returns None when the token has no such prefix.
+     fn leading_version(token: &str) -> Option<f32> {
+         let token = if token.starts_with('E') {
+             &token[1..]
+         } else {
+             token
+         };
+         let digits = |s: &str| s.bytes().take_while(u8::is_ascii_digit).count();
+         let major = digits(token);
+         if major == 0 || token.as_bytes().get(major) != Some(&b'.') {
+             return None;
+         }
+         let minor = digits(&token[major + 1..]);
+         token[..major + 1 + minor].parse().ok()
+     }
+
+     for line in emoji_data_lines.lines() {
+         let line = line.trim();
+         if line.starts_with('#') {
+             continue;
+         }
+         // Lines without a comment split into one part and are skipped too.
+         let parts = line.split('#').collect::<Vec<_>>();
+         if parts.len() != 2 {
+             continue;
+         }
+         let hexrange = parts[0].split(';').next().unwrap_or(parts[0]);
+         let version = match parts[1].split_whitespace().next().and_then(leading_version) {
+             Some(version) => version,
+             None => continue,
+         };
+
+         for cp in hexrange_to_range(hexrange) {
+             // Don't consider values below U+1F000 as emoji. These can only be
+             // made emoji through the variation selector, which interacts
+             // terribly with wcwidth().
+             if cp < 0x1F000 {
+                 continue;
+             }
+             // Any width already set from EastAsianWidth.txt is deliberately
+             // overwritten, even where EAW marks an emoji as neutral
+             // (e.g. U+1F336 "Hot Pepper").
+             //
+             // If this emoji was introduced before Unicode 9, then it was
+             // widened in 9.
+             // NOTE(review): tokens with an `E` prefix look like Emoji
+             // versions rather than Unicode versions; confirm that comparing
+             // them against 9.0 is intended.
+             codepoints[cp].width = Some(if version >= 9.0 {
+                 2
+             } else {
+                 WIDTH_WIDENED_IN_9
+             });
+         }
+     }
+ }
+ fn set_hardcoded_ranges(codepoints: &mut Vec<Codepoint<'_>>) {
+     // Force the category of private use and surrogate code points.
+     // Private use could be derived, awkwardly, from UnicodeData.txt, but the
+     // ranges are simply hard-coded here.
+     // "Private use high surrogates" are classed as surrogates rather than
+     // private use, so as to match wcwidth9().
+     const HARDCODED: [(usize, usize, &str); 5] = [
+         (0xE000, 0xF8FF, CAT_PRIVATE_USE),
+         (0xF0000, 0xFFFFD, CAT_PRIVATE_USE),
+         (0x100000, 0x10FFFD, CAT_PRIVATE_USE),
+         (0xD800, 0xDBFF, CAT_SURROGATE),
+         (0xDC00, 0xDFFF, CAT_SURROGATE),
+     ];
+     for &(first, last, category) in HARDCODED.iter() {
+         for cp in codepoints[first..=last].iter_mut() {
+             cp.category = category;
+         }
+     }
+ }
+
let mut file = File::create(&mod_path)?;
file.write_all(
br#"/*
@@ -110,7 +358,69 @@ pub const LINE_BREAK_RULES: &[(u32, u32, LineBreakClass)] = &[
file.write_all(format!(" (0x{:X}, 0x{:X}, {:?}),\n", l.0, l.1, l.2).as_bytes())
.unwrap();
}
- file.write_all(b"];").unwrap();
+ file.write_all(b"];\n").unwrap();
+
+ // Emit one `pub const NAME: &[(u32, u32)]` table per predicate. Each table
+ // lists the matching code points as inclusive (first, last) runs of
+ // consecutive values, in ascending order.
+ for (name, filter) in [
+     (
+         "ASCII",
+         Box::new(|c: &&Codepoint| c.raw < 0x7f && c.raw >= 0x20)
+             as Box<dyn Fn(&&Codepoint) -> bool>,
+     ),
+     (
+         "PRIVATE",
+         Box::new(|c: &&Codepoint| c.category == CAT_PRIVATE_USE),
+     ),
+     (
+         "NONPRINT",
+         Box::new(|c: &&Codepoint| {
+             ["Cc", "Cf", "Zl", "Zp", CAT_SURROGATE].contains(&c.category)
+         }),
+     ),
+     (
+         "COMBINING",
+         Box::new(|c: &&Codepoint| ["Mn", "Mc", "Me"].contains(&c.category)),
+     ),
+     ("DOUBLEWIDE", Box::new(|c: &&Codepoint| c.width == Some(2))),
+     (
+         "UNASSIGNED",
+         Box::new(|c: &&Codepoint| c.category == CAT_UNASSIGNED),
+     ),
+     (
+         "AMBIGUOUS",
+         Box::new(|c: &&Codepoint| c.width == Some(WIDTH_AMBIGUOUS_EASTASIAN)),
+     ),
+     (
+         "WIDENEDIN9",
+         Box::new(|c: &&Codepoint| c.width == Some(WIDTH_WIDENED_IN_9)),
+     ),
+ ]
+ .iter()
+ {
+     write!(file, "\npub const {}: &[(u32, u32)] = &[\n", name)?;
+     // The run of consecutive code points currently being accumulated.
+     // Stays None when nothing matches, which yields an empty table
+     // instead of a panic.
+     let mut run: Option<(u32, u32)> = None;
+     for cp in codepoints.iter().filter(filter) {
+         run = match run {
+             Some((first, last)) if last + 1 == cp.raw => Some((first, cp.raw)),
+             Some((first, last)) => {
+                 // Gap found: flush the finished run and start a new one.
+                 write!(file, " (0x{:X}, 0x{:X}),\n", first, last)?;
+                 Some((cp.raw, cp.raw))
+             }
+             None => Some((cp.raw, cp.raw)),
+         };
+     }
+     if let Some((first, last)) = run {
+         write!(file, " (0x{:X}, 0x{:X}),\n", first, last)?;
+     }
+     file.write_all(b"];\n")?;
+ }
}
Ok(())
}
diff --git a/melib/src/text_processing/tables.rs b/melib/src/text_processing/tables.rs
index ea5120c6..234074bb 100644
--- a/melib/src/text_processing/tables.rs
+++ b/melib/src/text_processing/tables.rs
@@ -3453,4 +3453,1354 @@ pub const LINE_BREAK_RULES: &[(u32, u32, LineBreakClass)] = &[
(0xE0100, 0xE01EF, CM),
(0xF0000, 0xFFFFD, XX),
(0x100000, 0x10FFFD, XX),
-]; \ No newline at end of file
+];
+
+pub const ASCII: &[(u32, u32)] = &[
+ (0x20, 0x7E),
+];
+
+pub const PRIVATE: &[(u32, u32)] = &[
+ (0xE000, 0xF8FF),
+ (0xF0000, 0xFFFFD),
+ (0x100000, 0x10FFFD),
+];
+
+pub const NONPRINT: &[(u32, u32)] = &[
+ (0x0, 0x1F),
+ (0x7F, 0x9F),
+ (0xAD, 0xAD),
+ (0x600, 0x605),
+ (0x61C, 0x61C),
+ (0x6DD, 0x6DD),
+ (0x70F, 0x70F),
+ (0x8E2, 0x8E2),
+ (0x180E, 0x180E),
+ (0x200B, 0x200F),
+ (0x2028, 0x202E),
+ (0x2060, 0x2064),
+ (0x2066, 0x206F),
+ (0xD800, 0xDFFF),
+ (0xFEFF, 0xFEFF),
+ (0xFFF9, 0xFFFB),
+ (0x110BD, 0x110BD),
+ (0x110CD, 0x110CD),
+ (0x13430, 0x13438),
+ (0x1BCA0, 0x1BCA3),
+ (0x1D173, 0x1D17A),
+ (0xE0001, 0xE0001),
+ (0xE0020, 0xE007F),
+];
+
+pub const COMBINING: &[(u32, u32)] = &[
+ (0x300, 0x36F),
+ (0x483, 0x489),
+ (0x591, 0x5BD),
+ (0x5BF, 0x5BF),
+ (0x5C1, 0x5C2),
+ (0x5C4, 0x5C5),
+ (0x5C7, 0x5C7),
+ (0x610, 0x61A),
+ (0x64B, 0x65F),
+ (0x670, 0x670),
+ (0x6D6, 0x6DC),
+ (0x6DF, 0x6E4),
+ (0x6E7, 0x6E8),
+ (0x6EA, 0x6ED),
+ (0x711, 0x711),
+ (0x730, 0x74A),
+ (0x7A6, 0x7B0),
+ (0x7EB, 0x7F3),
+ (0x7FD, 0x7FD),
+ (0x816, 0x819),
+ (0x81B, 0x823),
+ (0x825, 0x827),
+ (0x829, 0x82D),
+ (0x859, 0x85B),
+ (0x8D3, 0x8E1),
+ (0x8E3, 0x903),
+ (0x93A, 0x93C),
+ (0x93E, 0x94F),
+ (0x951, 0x957),
+ (0x962, 0x963),
+ (0x981, 0x983),
+ (0x9BC, 0x9BC),
+ (0x9BE, 0x9C4),
+ (0x9C7, 0x9C8),
+ (0x9CB, 0x9CD),
+ (0x9D7, 0x9D7),
+ (0x9E2, 0x9E3),
+ (0x9FE, 0x9FE),
+ (0xA01, 0xA03),
+ (0xA3C, 0xA3C),
+ (0xA3E, 0xA42),
+ (0xA47, 0xA48),
+ (0xA4B, 0xA4D),
+ (0xA51, 0xA51),
+ (0xA70, 0xA71),
+ (0xA75, 0xA75),
+ (0xA81, 0xA83),
+ (0xABC, 0xABC),
+ (0xABE, 0xAC5),
+ (0xAC7, 0xAC9),
+ (0xACB, 0xACD),
+ (0xAE2, 0xAE3),
+ (0xAFA, 0xAFF),
+ (0xB01, 0xB03),
+ (0xB3C, 0xB3C),
+ (0xB3E, 0xB44),
+ (0xB47, 0xB48),
+ (0xB4B, 0xB4D),
+ (0xB55, 0xB57),
+ (0xB62, 0xB63),
+ (0xB82, 0xB82),
+ (0xBBE, 0xBC2),
+ (0xBC6, 0xBC8),
+ (0xBCA, 0xBCD),
+ (0xBD7, 0xBD7),
+ (0xC00, 0xC04),
+ (0xC3E, 0xC44),
+ (0xC46, 0xC48),
+ (0xC4A, 0xC4D),
+ (0xC55, 0xC56),
+ (0xC62, 0xC63),
+ (0xC81, 0xC83),
+ (0xCBC, 0xCBC),
+ (0xCBE, 0xCC4),
+ (0xCC6, 0xCC8),
+ (0xCCA, 0xCCD),
+ (0xCD5, 0xCD6),
+ (0xCE2, 0xCE3),
+ (0xD00, 0xD03),
+ (0xD3B, 0xD3C),
+ (0xD3E, 0xD44),
+ (0xD46, 0xD48),
+ (0xD4A, 0xD4D),
+ (0xD57, 0xD57),
+ (0xD62, 0xD63),
+ (0xD81, 0xD83),
+ (0xDCA, 0xDCA),
+ (0xDCF, 0xDD4),
+ (0xDD6, 0xDD6),
+ (0xDD8, 0xDDF),
+ (0xDF2, 0xDF3),
+ (0xE31, 0xE31),
+ (0xE34, 0xE3A),
+ (0xE47, 0xE4E),
+ (0xEB1, 0xEB1),
+ (0xEB4, 0xEBC),
+ (0xEC8, 0xECD),
+ (0xF18, 0xF19),
+ (0xF35, 0xF35),
+ (0xF37, 0xF37),
+ (0xF39, 0xF39),
+ (0xF3E, 0xF3F),
+ (0xF71, 0xF84),
+ (0xF86, 0xF87),
+ (0xF8D, 0xF97),
+ (0xF99, 0xFBC),
+ (0xFC6, 0xFC6),
+ (0x102B, 0x103E),
+ (0x1056, 0x1059),
+ (0x105E, 0x1060),
+ (0x1062, 0x1064),
+ (0x1067, 0x106D),
+ (0x1071, 0x1074),
+ (0x1082, 0x108D),
+ (0x108F, 0x108F),
+ (0x109A, 0x109D),
+ (0x135D, 0x135F),
+ (0x1712, 0x1714),
+ (0x1732, 0x1734),
+ (0x1752, 0x1753),
+ (0x1772, 0x1773),
+ (0x17B4, 0x17D3),
+ (0x17DD, 0x17DD),
+ (0x180B, 0x180D),
+ (0x1885, 0x1886),
+ (0x18A9, 0x18A9),
+ (0x1920, 0x192B),
+ (0x1930, 0x193B),
+ (0x1A17, 0x1A1B),
+ (0x1A55, 0x1A5E),
+ (0x1A60, 0x1A7C),
+ (0x1A7F, 0x1A7F),
+ (0x1AB0, 0x1AC0),
+ (0x1B00, 0x1B04),
+ (0x1B34, 0x1B44),
+ (0x1B6B, 0x1B73),
+ (0x1B80, 0x1B82),
+ (0x1BA1, 0x1BAD),
+ (0x1BE6, 0x1BF3),
+ (0x1C24, 0x1C37),
+ (0x1CD0, 0x1CD2),
+ (0x1CD4, 0x1CE8),
+ (0x1CED, 0x1CED),
+ (0x1CF4, 0x1CF4),
+ (0x1CF7, 0x1CF9),
+ (0x1DC0, 0x1DF9),
+ (0x1DFB, 0x1DFF),
+ (0x20D0, 0x20F0),
+ (0x2CEF, 0x2CF1),
+ (0x2D7F, 0x2D7F),
+ (0x2DE0, 0x2DFF),
+ (0x302A, 0x302F),
+ (0x3099, 0x309A),
+ (0xA66F, 0xA672),
+ (0xA674, 0xA67D),
+ (0xA69E, 0xA69F),
+ (0xA6F0, 0xA6F1),
+ (0xA802, 0xA802),
+ (0xA806, 0xA806),
+ (0xA80B, 0xA80B),
+ (0xA823, 0xA827),
+ (0xA82C, 0xA82C),
+ (0xA880, 0xA881),
+ (0xA8B4, 0xA8C5),
+ (0xA8E0, 0xA8F1),
+ (0xA8FF, 0xA8FF),
+ (0xA926, 0xA92D),
+ (0xA947, 0xA953),
+ (0xA980, 0xA983),
+ (0xA9B3, 0xA9C0),
+ (0xA9E5, 0xA9E5),
+ (0xAA29, 0xAA36),
+ (0xAA43, 0xAA43),
+ (0xAA4C, 0xAA4D),
+ (0xAA7B, 0xAA7D),
+ (0xAAB0, 0xAAB0),
+ (0xAAB2, 0xAAB4),
+ (0xAAB7, 0xAAB8),
+ (0xAABE, 0xAABF),
+ (0xAAC1, 0xAAC1),
+ (0xAAEB, 0xAAEF),
+ (0xAAF5, 0xAAF6),
+ (0xABE3, 0xABEA),
+ (0xABEC, 0xABED),
+ (0xFB1E, 0xFB1E),
+ (0xFE00, 0xFE0F),
+ (0xFE20, 0xFE2F),
+ (0x101FD, 0x101FD),
+ (0x102E0, 0x102E0),
+ (0x10376, 0x1037A),
+ (0x10A01, 0x10A03),
+ (0x10A05, 0x10A06),
+ (0x10A0C, 0x10A0F),
+ (0x10A38, 0x10A3A),
+ (0x10A3F, 0x10A3F),
+ (0x10AE5, 0x10AE6),
+ (0x10D24, 0x10D27),
+ (0x10EAB, 0x10EAC),
+ (0x10F46, 0x10F50),
+ (0x11000, 0x11002),
+ (0x11038, 0x11046),
+ (0x1107F, 0x11082),
+ (0x110B0, 0x110BA),
+ (0x11100, 0x11102),
+ (0x11127, 0x11134),
+ (0x11145, 0x11146),
+ (0x11173, 0x11173),
+ (0x11180, 0x11182),
+ (0x111B3, 0x111C0),
+ (0x111C9, 0x111CC),
+ (0x111CE, 0x111CF),
+ (0x1122C, 0x11237),
+ (0x1123E, 0x1123E),
+ (0x112DF, 0x112EA),
+ (0x11300, 0x11303),
+ (0x1133B, 0x1133C),
+ (0x1133E, 0x11344),
+ (0x11347, 0x11348),
+ (0x1134B, 0x1134D),
+ (0x11357, 0x11357),
+ (0x11362, 0x11363),
+ (0x11366, 0x1136C),
+ (0x11370, 0x11374),
+ (0x11435, 0x11446),
+ (0x1145E, 0x1145E),
+ (0x114B0, 0x114C3),
+ (0x115AF, 0x115B5),
+ (0x115B8, 0x115C0),
+ (0x115DC, 0x115DD),
+ (0x11630, 0x11640),
+ (0x116AB, 0x116B7),
+ (0x1171D, 0x1172B),
+ (0x1182C, 0x1183A),
+ (0x11930, 0x11935),
+ (0x11937, 0x11938),
+ (0x1193B, 0x1193E),
+ (0x11940, 0x11940),
+ (0x11942, 0x11943),
+ (0x119D1, 0x119D7),
+ (0x119DA, 0x119E0),
+ (0x119E4, 0x119E4),
+ (0x11A01, 0x11A0A),
+ (0x11A33, 0x11A39),
+ (0x11A3B, 0x11A3E),
+ (0x11A47, 0x11A47),
+ (0x11A51, 0x11A5B),
+ (0x11A8A, 0x11A99),
+ (0x11C2F, 0x11C36),
+ (0x11C38, 0x11C3F),
+ (0x11C92, 0x11CA7),
+ (0x11CA9, 0x11CB6),
+ (0x11D31, 0x11D36),
+ (0x11D3A, 0x11D3A),
+ (0x11D3C, 0x11D3D),
+ (0x11D3F, 0x11D45),
+ (0x11D47, 0x11D47),
+ (0x11D8A, 0x11D8E),
+ (0x11D90, 0x11D91),
+ (0x11D93, 0x11D97),
+ (0x11EF3, 0x11EF6),
+ (0x16AF0, 0x16AF4),
+ (0x16B30, 0x16B36),
+ (0x16F4F, 0x16F4F),
+ (0x16F51, 0x16F87),
+ (0x16F8F, 0x16F92),
+ (0x16FE4, 0x16FE4),
+ (0x16FF0, 0x16FF1),
+ (0x1BC9D, 0x1BC9E),
+ (0x1D165, 0x1D169),
+ (0x1D16D, 0x1D172),
+ (0x1D17B, 0x1D182),
+ (0x1D185, 0x1D18B),
+ (0x1D1AA, 0x1D1AD),
+ (0x1D242, 0x1D244),
+ (0x1DA00, 0x1DA36),
+ (0x1DA3B, 0x1DA6C),
+ (0x1DA75, 0x1DA75),
+ (0x1DA84, 0x1DA84),
+ (0x1DA9B, 0x1DA9F),
+ (0x1DAA1, 0x1DAAF),
+ (0x1E000, 0x1E006),
+ (0x1E008, 0x1E018),
+ (0x1E01B, 0x1E021),
+ (0x1E023, 0x1E024),
+ (0x1E026, 0x1E02A),
+ (0x1E130, 0x1E136),
+ (0x1E2EC, 0x1E2EF),
+ (0x1E8D0, 0x1E8D6),
+ (0x1E944, 0x1E94A),
+ (0xE0100, 0xE01EF),
+];
+
+pub const DOUBLEWIDE: &[(u32, u32)] = &[
+ (0x1100, 0x115F),
+ (0x231A, 0x231B),
+ (0x2329, 0x232A),
+ (0x23E9, 0x23EC),
+ (0x23F0, 0x23F0),
+ (0x23F3, 0x23F3),
+ (0x25FD, 0x25FE),
+ (0x2614, 0x2615),
+ (0x2648, 0x2653),
+ (0x267F, 0x267F),
+ (0x2693, 0x2693),
+ (0x26A1, 0x26A1),
+ (0x26AA, 0x26AB),
+ (0x26BD, 0x26BE),
+ (0x26C4, 0x26C5),
+ (0x26CE, 0x26CE),
+ (0x26D4, 0x26D4),
+ (0x26EA, 0x26EA),
+ (0x26F2, 0x26F3),
+ (0x26F5, 0x26F5),
+ (0x26FA, 0x26FA),
+ (0x26FD, 0x26FD),
+ (0x2705, 0x2705),
+ (0x270A, 0x270B),
+ (0x2728, 0x2728),
+ (0x274C, 0x274C),
+ (0x274E, 0x274E),
+ (0x2753, 0x2755),
+ (0x2757, 0x2757),
+ (0x2795, 0x2797),
+ (0x27B0, 0x27B0),
+ (0x27BF, 0x27BF),
+ (0x2B1B, 0x2B1C),
+ (0x2B50, 0x2B50),
+ (0x2B55, 0x2B55),
+ (0x2E80, 0x2E99),
+ (0x2E9B, 0x2EF3),
+ (0x2F00, 0x2FD5),
+ (0x2FF0, 0x2FFB),
+ (0x3000, 0x303E),
+ (0x3041, 0x3096),
+ (0x3099, 0x30FF),
+ (0x3105, 0x312F),
+ (0x3131, 0x318E),
+ (0x3190, 0x31E3),
+ (0x31F0, 0x321E),
+ (0x3220, 0x3247),
+ (0x3250, 0x4DBF),
+ (0x4E00, 0xA48C),
+ (0xA490, 0xA4C6),
+ (0xA960, 0xA97C),
+ (0xAC00, 0xD7A3),
+ (0xF900, 0xFAFF),
+ (0xFE10, 0xFE19),
+ (0xFE30, 0xFE52),
+ (0xFE54, 0xFE66),
+ (0xFE68, 0xFE6B),
+ (0xFF01, 0xFF60),
+ (0xFFE0, 0xFFE6),
+ (0x16FE0, 0x16FE4),
+ (0x16FF0, 0x16FF1),
+ (0x17000, 0x187F7),
+ (0x18800, 0x18CD5),
+ (0x18D00, 0x18D08),
+ (0x1B000, 0x1B11E),
+ (0x1B150, 0x1B152),
+ (0x1B164, 0x1B167),
+ (0x1B170, 0x1B2FB),
+ (0x1F200, 0x1F200),
+ (0x1F210, 0x1F219),
+ (0x1F21B, 0x1F22E),
+ (0x1F230, 0x1F231),
+ (0x1F23B, 0x1F23B),
+ (0x1F240, 0x1F248),
+ (0x1F6D5, 0x1F6D7),
+ (0x1F6F9, 0x1F6FC),
+ (0x1F7E0, 0x1F7EB),
+ (0x1F90C, 0x1F90F),
+ (0x1F93F, 0x1F93F),
+ (0x1F94D, 0x1F94F),
+ (0x1F96C, 0x1F978),
+ (0x1F97A, 0x1F97F),
+ (0x1F998, 0x1F9BF),
+ (0x1F9C1, 0x1F9CB),
+ (0x1F9CD, 0x1F9CF),
+ (0x1F9E7, 0x1F9FF),
+ (0x1FA70, 0x1FA74),
+ (0x1FA78, 0x1FA7A),
+ (0x1FA80, 0x1FA86),
+ (0x1FA90, 0x1FAA8),
+ (0x1FAB0, 0x1FAB6),
+ (0x1FAC0, 0x1FAC2),
+ (0x1FAD0, 0x1FAD6),
+ (0x20000, 0x2FFFD),
+ (0x30000, 0x3FFFD),
+];
+
+pub const UNASSIGNED: &[(u32, u32)] = &[
+ (0x378, 0x379),
+ (0x380, 0x383),
+ (0x38B, 0x38B),
+ (0x38D, 0x38D),
+ (0x3A2, 0x3A2),
+ (0x530, 0x530),
+ (0x557, 0x558),
+ (0x58B, 0x58C),
+ (0x590, 0x590),
+ (0x5C8, 0x5CF),
+ (0x5EB, 0x5EE),
+ (0x5F5, 0x5FF),
+ (0x61D, 0x61D),
+ (0x70E, 0x70E),
+ (0x74B, 0x74C),
+ (0x7B2, 0x7BF),
+ (0x7FB, 0x7FC),
+ (0x82E, 0x82F),
+ (0x83F, 0x83F),
+ (0x85C, 0x85D),
+ (0x85F, 0x85F),
+ (0x86B, 0x89F),
+ (0x8B5, 0x8B5),
+ (0x8C8, 0x8D2),
+ (0x984, 0x984),
+ (0x98D, 0x98E),
+ (0x991, 0x992),
+ (0x9A9, 0x9A9),
+ (0x9B1, 0x9B1),
+ (0x9B3, 0x9B5),
+ (0x9BA, 0x9BB),
+ (0x9C5, 0x9C6),
+ (0x9C9, 0x9CA),
+ (0x9CF, 0x9D6),
+ (0x9D8, 0x9DB),
+ (0x9DE, 0x9DE),
+ (0x9E4, 0x9E5),
+ (0x9FF, 0xA00),
+ (0xA04, 0xA04),
+ (0xA0B, 0xA0E),
+ (0xA11, 0xA12),
+ (0xA29, 0xA29),
+ (0xA31, 0xA31),
+ (0xA34, 0xA34),
+ (0xA37, 0xA37),
+ (0xA3A, 0xA3B),
+ (0xA3D, 0xA3D),
+ (0xA43, 0xA46),
+ (0xA49, 0xA4A),
+ (0xA4E, 0xA50),
+ (0xA52, 0xA58),
+ (0xA5D, 0xA5D),
+ (0xA5F, 0xA65),
+ (0xA77, 0xA80),
+ (0xA84, 0xA84),
+ (0xA8E, 0xA8E),
+ (0xA92, 0xA92),
+ (0xAA9, 0xAA9),
+ (0xAB1, 0xAB1),
+ (0xAB4, 0xAB4),
+ (0xABA, 0xABB),
+ (0xAC6, 0xAC6),
+ (0xACA, 0xACA),
+ (0xACE, 0xACF),
+ (0xAD1, 0xADF),
+ (0xAE4, 0xAE5),
+ (0xAF2, 0xAF8),
+ (0xB00, 0xB00),
+ (0xB04, 0xB04),
+ (0xB0D, 0xB0E),
+ (0xB11, 0xB12),
+ (0xB29, 0xB29),
+ (0xB31, 0xB31),
+ (0xB34, 0xB34),
+ (0xB3A, 0xB3B),
+ (0xB45, 0xB46),
+ (0xB49, 0xB4A),
+ (0xB4E, 0xB54),
+ (0xB58, 0xB5B),
+ (0xB5E, 0xB5E),
+ (0xB64, 0xB65),
+ (0xB78, 0xB81),
+ (0xB84, 0xB84),
+ (0xB8B, 0xB8D),
+ (0xB91, 0xB91),
+ (0xB96, 0xB98),
+ (0xB9B, 0xB9B),
+ (0xB9D, 0xB9D),
+ (0xBA0, 0xBA2),
+ (0xBA5, 0xBA7),
+ (0xBAB, 0xBAD),
+ (0xBBA, 0xBBD),
+ (0xBC3, 0xBC5),
+ (0xBC9, 0xBC9),
+ (0xBCE, 0xBCF),
+ (0xBD1, 0xBD6),
+ (0xBD8, 0xBE5),
+ (0xBFB, 0xBFF),
+ (0xC0D, 0xC0D),
+ (0xC11, 0xC11),
+ (0xC29, 0xC29),
+ (0xC3A, 0xC3C),
+ (0xC45, 0xC45),
+ (0xC49, 0xC49),
+ (0xC4E, 0xC54),
+ (0xC57, 0xC57),
+ (0xC5B, 0xC5F),
+ (0xC64, 0xC65),
+ (0xC70, 0xC76),
+ (0xC8D, 0xC8D),
+ (0xC91, 0xC91),
+ (0xCA9, 0xCA9),
+ (0xCB4, 0xCB4),
+ (0xCBA, 0xCBB),
+ (0xCC5, 0xCC5),
+ (0xCC9, 0xCC9),
+ (0xCCE, 0xCD4),
+ (0xCD7, 0xCDD),
+ (0xCDF, 0xCDF),
+ (0xCE4, 0xCE5),
+ (0xCF0, 0xCF0),
+ (0xCF3, 0xCFF),
+ (0xD0D, 0xD0D),
+ (0xD11, 0xD11),
+ (0xD45, 0xD45),
+ (0xD49, 0xD49),
+ (0xD50, 0xD53),
+ (0xD64, 0xD65),
+ (0xD80, 0xD80),
+ (0xD84, 0xD84),
+ (0xD97, 0xD99),
+ (0xDB2, 0xDB2),
+ (0xDBC, 0xDBC),
+ (0xDBE, 0xDBF),
+ (0xDC7, 0xDC9),
+ (0xDCB, 0xDCE),
+ (0xDD5, 0xDD5),
+ (0xDD7, 0xDD7),
+ (0xDE0, 0xDE5),
+ (0xDF0, 0xDF1),
+ (0xDF5, 0xE00),
+ (0xE3B, 0xE3E),
+ (0xE5C, 0xE80),
+ (0xE83, 0xE83),
+ (0xE85, 0xE85),
+ (0xE8B, 0xE8B),
+ (0xEA4, 0xEA4),
+ (0xEA6, 0xEA6),
+ (0xEBE, 0xEBF),
+ (0xEC5, 0xEC5),
+ (0xEC7, 0xEC7),
+ (0xECE, 0xECF),
+ (0xEDA, 0xEDB),
+ (0xEE0, 0xEFF),
+ (0xF48, 0xF48),
+ (0xF6D, 0xF70),
+ (0xF98, 0xF98),
+ (0xFBD, 0xFBD),
+ (0xFCD, 0xFCD),
+ (0xFDB, 0xFFF),
+ (0x10C6, 0x10C6),
+ (0x10C8, 0x10CC),
+ (0x10CE, 0x10CF),
+ (0x1249, 0x1249),
+ (0x124E, 0x124F),
+ (0x1257, 0x1257),
+ (0x1259, 0x1259),
+ (0x125E, 0x125F),
+ (0x1289, 0x1289),
+ (0x128E, 0x128F),
+ (0x12B1, 0x12B1),
+ (0x12B6, 0x12B7),
+ (0x12BF, 0x12BF),
+ (0x12C1, 0x12C1),
+ (0x12C6, 0x12C7),
+ (0x12D7, 0x12D7),
+ (0x1311, 0x1311),
+ (0x1316, 0x1317),
+ (0x135B, 0x135C),
+ (0x137D, 0x137F),
+ (0x139A, 0x139F),
+ (0x13F6, 0x13F7),
+ (0x13FE, 0x13FF),
+ (0x169D, 0x169F),
+ (0x16F9, 0x16FF),
+ (0x170D, 0x170D),
+ (0x1715, 0x171F),
+ (0x1737, 0x173F),
+ (0x1754, 0x175F),
+ (0x176D, 0x176D),
+ (0x1771, 0x1771),
+ (0x1774, 0x177F),
+ (0x17DE, 0x17DF),
+ (0x17EA, 0x17EF),
+ (0x17FA, 0x17FF),
+ (0x180F, 0x180F),
+ (0x181A, 0x181F),
+ (0x1879, 0x187F),
+ (0x18AB, 0x18AF),
+ (0x18F6, 0x18FF),
+ (0x191F, 0x191F),
+ (0x192C, 0x192F),
+ (0x193C, 0x193F),
+ (0x1941, 0x1943),
+ (0x196E, 0x196F),
+ (0x1975, 0x197F),
+ (0x19AC, 0x19AF),
+ (0x19CA, 0x19CF),
+ (0x19DB, 0x19DD),
+ (0x1A1C, 0x1A1D),
+ (0x1A5F, 0x1A5F),
+ (0x1A7D, 0x1A7E),
+ (0x1A8A, 0x1A8F),
+ (0x1A9A, 0x1A9F),
+ (0x1AAE, 0x1AAF),
+ (0x1AC1, 0x1AFF),
+ (0x1B4C, 0x1B4F),
+ (0x1B7D, 0x1B7F),
+ (0x1BF4, 0x1BFB),
+ (0x1C38, 0x1C3A),
+ (0x1C4A, 0x1C4C),
+ (0x1C89, 0x1C8F),
+ (0x1CBB, 0x1CBC),
+ (0x1CC8, 0x1CCF),
+ (0x1CFB, 0x1CFF),
+ (0x1DFA, 0x1DFA),
+ (0x1F16, 0x1F17),
+ (0x1F1E, 0x1F1F),
+ (0x1F46, 0x1F47),
+ (0x1F4E, 0x1F4F),
+ (0x1F58, 0x1F58),
+ (0x1F5A, 0x1F5A),
+ (0x1F5C, 0x1F5C),
+ (0x1F5E, 0x1F5E),
+ (0x1F7E, 0x1F7F),
+ (0x1FB5, 0x1FB5),
+ (0x1FC5, 0x1FC5),
+ (0x1FD4, 0x1FD5),
+ (0x1FDC, 0x1FDC),
+ (0x1FF0, 0x1FF1),
+ (0x1FF5, 0x1FF5),
+ (0x1FFF, 0x1FFF),
+ (0x2065, 0x2065),
+ (0x2072, 0x2073),
+ (0x208F, 0x208F),
+ (0x209D, 0x209F),
+ (0x20C0, 0x20CF),
+ (0x20F1, 0x20FF),
+ (0x218C, 0x218F),
+ (0x2427, 0x243F),
+ (0x244B, 0x245F),
+ (0x2B74, 0x2B75),
+ (0x2B96, 0x2B96),
+ (0x2C2F, 0x2C2F),
+ (0x2C5F, 0x2C5F),
+ (0x2CF4, 0x2CF8),
+ (0x2D26, 0x2D26),
+ (0x2D28, 0x2D2C),
+ (0x2D2E, 0x2D2F),
+ (0x2D68, 0x2D6E),
+ (0x2D71, 0x2D7E),
+ (0x2D97, 0x2D9F),
+ (0x2DA7, 0x2DA7),
+ (0x2DAF, 0x2DAF),
+ (0x2DB7, 0x2DB7),
+ (0x2DBF, 0x2DBF),
+ (0x2DC7, 0x2DC7),
+ (0x2DCF, 0x2DCF),
+ (0x2DD7, 0x2DD7),
+ (0x2DDF, 0x2DDF),
+ (0x2E53, 0x2E7F),
+ (0x2E9A, 0x2E9A),
+ (0x2EF4, 0x2EFF),
+ (0x2FD6, 0x2FEF),
+ (0x2FFC, 0x2FFF),
+ (0x3040, 0x3040),
+ (0x3097, 0x3098),
+ (0x3100, 0x3104),
+ (0x3130, 0x3130),
+ (0x318F, 0x318F),
+ (0x31E4, 0x31EF),
+ (0x321F, 0x321F),
+ (0x3401, 0x4DBE),
+ (0x4E01, 0x9FFB),
+ (0x9FFD, 0x9FFF),
+ (0xA48D, 0xA48F),
+ (0xA4C7, 0xA4CF),
+ (0xA62C, 0xA63F),
+ (0xA6F8, 0xA6FF),
+ (0xA7C0, 0xA7C1),
+ (0xA7CB, 0xA7F4),
+ (0xA82D, 0xA82F),
+ (0xA83A, 0xA83F),
+ (0xA878, 0xA87F),
+ (0xA8C6, 0xA8CD),
+ (0xA8DA, 0xA8DF),
+ (0xA954, 0xA95E),
+ (0xA97D, 0xA97F),
+ (0xA9CE, 0xA9CE),
+ (0xA9DA, 0xA9DD),
+ (0xA9FF, 0xA9FF),
+ (0xAA37, 0xAA3F),
+ (0xAA4E, 0xAA4F),
+ (0xAA5A, 0xAA5B),
+ (0xAAC3, 0xAADA),
+ (0xAAF7, 0xAB00),
+ (0xAB07, 0xAB08),
+ (0xAB0F, 0xAB10),
+ (0xAB17, 0xAB1F),
+ (0xAB27, 0xAB27),
+ (0xAB2F, 0xAB2F),
+ (0xAB6C, 0xAB6F),
+ (0xABEE, 0xABEF),
+ (0xABFA, 0xABFF),
+ (0xAC01, 0xD7A2),
+ (0xD7A4, 0xD7AF),
+ (0xD7C7, 0xD7CA),
+ (0xD7FC, 0xD7FF),
+ (0xFA6E, 0xFA6F),
+ (0xFADA, 0xFAFF),
+ (0xFB07, 0xFB12),
+ (0xFB18, 0xFB1C),
+ (0xFB37, 0xFB37),
+ (0xFB3D, 0xFB3D),
+ (0xFB3F, 0xFB3F),
+ (0xFB42, 0xFB42),
+ (0xFB45, 0xFB45),
+ (0xFBC2, 0xFBD2),
+ (0xFD40, 0xFD4F),
+ (0xFD90, 0xFD91),
+ (0xFDC8, 0xFDEF),
+ (0xFDFE, 0xFDFF),
+ (0xFE1A, 0xFE1F),
+ (0xFE53, 0xFE53),
+ (0xFE67, 0xFE67),
+ (0xFE6C, 0xFE6F),
+ (0xFE75, 0xFE75),
+ (0xFEFD, 0xFEFE),
+ (0xFF00, 0xFF00),
+ (0xFFBF, 0xFFC1),
+ (0xFFC8, 0xFFC9),
+ (0xFFD0, 0xFFD1),
+ (0xFFD8, 0xFFD9),
+ (0xFFDD, 0xFFDF),