diff options
author | Manos Pitsidianakis <el13635@mail.ntua.gr> | 2019-07-27 01:56:07 +0300 |
---|---|---|
committer | Manos Pitsidianakis <el13635@mail.ntua.gr> | 2019-07-27 01:56:07 +0300 |
commit | 5b679be782e4930be393654c1a3095422bcf242e (patch) | |
tree | 9382ae647d0fd6a65ad0faf25189ba843bd839a8 /text_processing | |
parent | d84ceca88e1882a8db3c7630633a3f840f4593d6 (diff) |
text_processing: implement Unicode line breaking algorithm
Not yet conforming to the Unicode standard.
Diffstat (limited to 'text_processing')
-rw-r--r-- | text_processing/Cargo.toml | 1 | ||||
-rw-r--r-- | text_processing/build.rs | 73 | ||||
-rw-r--r-- | text_processing/src/lib.rs | 4 | ||||
-rw-r--r-- | text_processing/src/line_break.rs | 703 | ||||
-rw-r--r-- | text_processing/src/tables.rs | 3389 | ||||
-rw-r--r-- | text_processing/src/types.rs | 102 |
6 files changed, 4272 insertions, 0 deletions
diff --git a/text_processing/Cargo.toml b/text_processing/Cargo.toml index 4b6b1c02..2074a00c 100644 --- a/text_processing/Cargo.toml +++ b/text_processing/Cargo.toml @@ -4,6 +4,7 @@ version = "0.0.1" #:version authors = ["Manos Pitsidianakis <el13635@mail.ntua.gr>"] workspace = ".." edition = "2018" +build = "build.rs" [dependencies] unicode-segmentation = "1.2.1" diff --git a/text_processing/build.rs b/text_processing/build.rs new file mode 100644 index 00000000..d740676b --- /dev/null +++ b/text_processing/build.rs @@ -0,0 +1,73 @@ +const LINE_BREAK_TABLE_URL: &str = "http://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt"; +use std::fs::File; +use std::io::prelude::*; +use std::io::BufReader; +use std::path::PathBuf; +use std::process::Command; + +include!("src/types.rs"); + +fn main() -> Result<(), std::io::Error> { + let mod_path = PathBuf::from("src/tables.rs"); + if mod_path.exists() { + eprintln!( + "{} already exists, delete it if you want to replace it.", + mod_path.display() + ); + std::process::exit(0); + } + let mut tmpdir_path = PathBuf::from( + std::str::from_utf8(&Command::new("mktemp").arg("-d").output()?.stdout) + .unwrap() + .trim(), + ); + tmpdir_path.push("LineBreak.txt"); + Command::new("curl") + .args(&["-o", tmpdir_path.to_str().unwrap(), LINE_BREAK_TABLE_URL]) + .output()?; + + let file = File::open(&tmpdir_path)?; + let buf_reader = BufReader::new(file); + + let mut line_break_table: Vec<(u32, u32, LineBreakClass)> = Vec::with_capacity(3800); + for line in buf_reader.lines() { + let line = line.unwrap(); + if line.starts_with('#') || line.starts_with(' ') || line.is_empty() { + continue; + } + let tokens: &str = line.split_whitespace().next().unwrap(); + + let semicolon_idx: usize = tokens.chars().position(|c| c == ';').unwrap(); + /* LineBreak.txt list is ascii encoded so we can assume each char takes one byte: */ + let chars_str: &str = &tokens[..semicolon_idx]; + + let mut codepoint_iter = chars_str.split(".."); + + let 
first_codepoint: u32 = + u32::from_str_radix(std::dbg!(codepoint_iter.next().unwrap()), 16).unwrap(); + + let sec_codepoint: u32 = codepoint_iter + .next() + .map(|v| u32::from_str_radix(std::dbg!(v), 16).unwrap()) + .unwrap_or(first_codepoint); + let class = &tokens[semicolon_idx + 1..semicolon_idx + 1 + 2]; + line_break_table.push((first_codepoint, sec_codepoint, LineBreakClass::from(class))); + } + + let mut file = File::create(&mod_path)?; + file.write_all(b"use crate::types::LineBreakClass::*;\n") + .unwrap(); + file.write_all(b"use crate::types::LineBreakClass;\n\n") + .unwrap(); + file.write_all(b"const line_break_rules: &'static [(u32, u32, LineBreakClass)] = &[\n") + .unwrap(); + for l in &line_break_table { + file.write_all(format!(" (0x{:X}, 0x{:X}, {:?}),\n", l.0, l.1, l.2).as_bytes()) + .unwrap(); + } + file.write_all(b"];").unwrap(); + std::fs::remove_file(&tmpdir_path).unwrap(); + tmpdir_path.pop(); + std::fs::remove_dir(&tmpdir_path).unwrap(); + Ok(()) +} diff --git a/text_processing/src/lib.rs b/text_processing/src/lib.rs index 3b7b33ae..59d03123 100644 --- a/text_processing/src/lib.rs +++ b/text_processing/src/lib.rs @@ -1,4 +1,8 @@ pub mod grapheme_clusters; +pub mod line_break; +mod tables; +mod types; pub mod wcwidth; pub use grapheme_clusters::*; +pub use line_break::*; pub use wcwidth::*; diff --git a/text_processing/src/line_break.rs b/text_processing/src/line_break.rs new file mode 100644 index 00000000..5f42e7b9 --- /dev/null +++ b/text_processing/src/line_break.rs @@ -0,0 +1,703 @@ +extern crate unicode_segmentation; +use self::unicode_segmentation::UnicodeSegmentation; +use crate::tables::LINE_BREAK_RULES; +use crate::types::LineBreakClass; +use core::cmp::Ordering; +use core::iter::Peekable; +use core::str::FromStr; +use LineBreakClass::*; + +#[derive(Debug, PartialEq)] +pub enum LineBreakCandidate { + MandatoryBreak, + BreakAllowed, + // NoBreak, Not used. 
+} + +use LineBreakCandidate::*; + +pub struct LineBreakCandidateIter<'a> { + text: &'a str, + iter: Peekable<unicode_segmentation::GraphemeIndices<'a>>, + pos: usize, + /* Needed for rule LB30a */ + reg_ind_streak: u32, +} + +impl<'a> LineBreakCandidateIter<'a> { + pub fn new(text: &'a str) -> Self { + LineBreakCandidateIter { + text, + pos: 0, + iter: UnicodeSegmentation::grapheme_indices(text, true).peekable(), + reg_ind_streak: 0, + } + } +} + +macro_rules! get_base_character { + ($grapheme:ident) => {{ + char::from_str($grapheme.get(0..1).unwrap_or_else(|| { + $grapheme.get(0..2).unwrap_or_else(|| { + $grapheme + .get(0..3) + .unwrap_or_else(|| $grapheme.get(0..4).unwrap()) + }) + })) + }}; + ($grapheme:expr) => {{ + char::from_str($grapheme.get(0..1).unwrap_or_else(|| { + $grapheme.get(0..2).unwrap_or_else(|| { + $grapheme + .get(0..3) + .unwrap_or_else(|| $grapheme.get(0..4).unwrap()) + }) + })) + }}; +} + +/// Side effects: none +macro_rules! get_class { + ($grapheme:ident) => {{ + get_base_character!($grapheme) + .map(|char| search_table(char as u32, LINE_BREAK_RULES)) + .unwrap_or(XX) + }}; + ($grapheme:expr) => {{ + get_base_character!($grapheme) + .map(|char| search_table(char as u32, LINE_BREAK_RULES)) + .unwrap_or(XX) + }}; +} + +/// Side effects: Updates $graph_iter and potentially $idx and $grapheme +macro_rules! 
next_grapheme_class { + ($graph_iter:ident, $grapheme:ident) => ({ + if let Some((_, g)) = $graph_iter.next() { + $grapheme = g; + Some(get_class!(g)) + } else { None } + }); + (($next_char:ident is $class:expr)) => ({ + $next_char.is_some() && get_class!(($next_char.unwrap().1)) == $class + }); + (($next_char:ident is $($class:ident),+)) => ({ + $next_char.is_some() && ($(get_class!(($next_char.unwrap().1)) == $class)||+) + }); +} + +/// Returns positions where breaks can happen +/// Examples: +/// ``` +/// use text_processing::{self, LineBreakCandidate::{self, *}}; +/// use text_processing::line_break::LineBreakCandidateIter; +/// +/// assert!(LineBreakCandidateIter::new("").collect::<Vec<(usize, LineBreakCandidate)>>().is_empty()); +/// assert_eq!(&[(7, BreakAllowed), (12, MandatoryBreak)], +/// LineBreakCandidateIter::new("Sample Text.").collect::<Vec<(usize, LineBreakCandidate)>>().as_slice()); +/// assert_eq!(&[(3, MandatoryBreak), (7, MandatoryBreak), (10, BreakAllowed), (17, MandatoryBreak)], +/// LineBreakCandidateIter::new("Sa\nmp\r\nle T(e)xt.").collect::<Vec<(usize, LineBreakCandidate)>>().as_slice()); +/// ``` +impl<'a> Iterator for LineBreakCandidateIter<'a> { + type Item = (usize, LineBreakCandidate); + fn next(&mut self) -> Option<Self::Item> { + // After end of text, there are no breaks. 
+ if self.pos >= self.text.len() { + return None; + } + // LB3 Always break at the end of text + if self.pos + 1 == self.text.len() { + self.pos += 1; + return Some((self.pos, MandatoryBreak)); + } + + let (idx, mut grapheme) = self.iter.next().unwrap(); + let LineBreakCandidateIter { + ref mut iter, + ref text, + ref mut reg_ind_streak, + ref mut pos, + } = self; + let iter = iter.by_ref(); + + debug_assert_eq!(idx, *pos); + + // LB2 Never break at the start of text + if idx == 0 { + *pos += grapheme.len(); + return self.next(); + } + + let class = get_class!(grapheme); + + if class != RI { + *reg_ind_streak = 0; + } + + /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ, + * SA, SG, and XX into other line breaking classes depending on criteria outside the scope + * of this algorithm. + * + * In the absence of such criteria all characters with a specific combination of original + * class and General_Category property value are resolved as follows: + * Resolved Original General_Category + * AL AI, SG, XX Any + * CM SA Only Mn or Mc + * AL SA Any except Mn and Mc + * NS SJ Any + */ + + // TODO: LB1 + + /* Check if next character class allows breaks before it */ + let next_char: Option<&(usize, &str)> = iter.peek(); + + match class { + BK => { + // LB4 Always Break after hard line breaks. + *pos += grapheme.len(); + return Some((*pos, MandatoryBreak)); + } + // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks + CR if next_grapheme_class!((next_char is LF)) => { + *pos += grapheme.len(); + assert!(Some(LF) == next_grapheme_class!(iter, grapheme)); + *pos += grapheme.len(); + return Some((*pos, MandatoryBreak)); + } + CR | LF | NL => { + *pos += grapheme.len(); + return Some((*pos, MandatoryBreak)); + } + _ => {} + } + if let Some((_, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + /* LB6 Do not break before hard line breaks. 
× ( BK | CR | LF | NL ) */ + BK | CR | LF | NL => { + *pos += grapheme.len(); + return self.next(); + } + /* LB7 Do not break before spaces or zero width + * space. × SP × ZW */ + SP | ZW => { + *pos += grapheme.len(); + return self.next(); + } + _ => {} + } + } + match class { + ZW => { + // LB8 Break before any character following a zero-width space, even if one or more + // spaces intervene + // ZW SP* ÷ + *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + return Some((*pos, MandatoryBreak)); + } + ZWJ => { + // LB8a Do not break after a zero width joiner. + *pos += grapheme.len(); + return self.next(); + } + + CM => { + // LB9 Do not break a combining character sequence; treat it as if it has the line + // breaking class of the base character in all of the following rules. Treat ZWJ as + // if it were CM. + // Treat X (CM | ZWJ)* as if it were X. + // where X is any line break class except BK, CR, LF, NL, SP, or ZW. + + /* Unreachable since we break lines based on graphemes, not characters */ + unreachable!(); + } + WJ => { + /*: LB11 Do not break before or after Word joiner and related characters.*/ + *pos += grapheme.len(); + /* Get next grapheme */ + if next_grapheme_class!(iter, grapheme).is_some() { + *pos += grapheme.len(); + } + return self.next(); + } + GL => { + /*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/ + *pos += grapheme.len(); + return self.next(); + } + _ => {} + } + if let Some((next_idx, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + GL if ![SP, BA, HY].contains(&class) => { + /* LB12a Do not break before NBSP and related characters, except after spaces and + * hyphens. [^SP BA HY] × GL + * Also LB12 Do not break after NBSP and related characters */ + *pos += grapheme.len(); + return self.next(); + } + /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. 
*/ + CL | CP | EX | IS | SY => { + *pos = *next_idx; + return self.next(); + } + _ => {} + } + } + + match class { + /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */ + SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => { + *pos += grapheme.len(); + while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap()) + { + *pos += grapheme.len(); + } + *pos += grapheme.len(); + return self.next(); + } + OP => { + /* LB14 Do not break after ‘[’, even after spaces. + * OP SP* × + */ + while let Some((idx, grapheme)) = self.iter.next() { + *pos = idx + grapheme.len(); + if !(get_class!(grapheme) == SP) { + break; + } + } + return self.next(); + } + QU if get_class!(text[idx..].trim_start()) == OP => { + /* LB15 Do not break within ‘”[’, even with intervening spaces. + * QU SP* × OP */ + *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + *pos = idx; + return self.next(); + } + QU => { + /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */ + *pos += grapheme.len(); + if let Some((_, g)) = self.iter.next() { + *pos += g.len(); + } + return self.next(); + } + LineBreakClass::CL | LineBreakClass::CP + if get_class!(text[idx..].trim_start()) == NS => + { + /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with + * intervening spaces. + * (CL | CP) SP* × NS */ + *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + return self.next(); + } + B2 if get_class!(text[idx..].trim_start()) == B2 => { + *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + return self.next(); + } + SP => { + /* LB18 Break after spaces. SP ÷ */ + // Space 0x20 is 1 byte long. 
+ *pos += 1; + return Some((*pos, BreakAllowed)); + } + _ => {} + } + if let Some((next_idx, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + QU if class != SP => { + /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */ + *pos = *next_idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + _ => {} + } + } + match class { + CB => { + /* LB20 Break before and after unresolved CB. */ + *pos += grapheme.len(); + return Some((*pos - 1, BreakAllowed)); + } + /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small + * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */ + BB => { + *pos += grapheme.len(); + return self.next(); + } + _ => {} + } + + if let Some((_, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + BA | HY | NS => { + /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small + * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */ + *pos += grapheme.len(); + return self.next(); + } + _ => {} + } + } + match class { + HL if next_grapheme_class!((next_char is HY, BA)) => { + /* LB21a Don’t break after Hebrew + Hyphen. HL (HY | BA) × */ + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* LB21b Don’t break between ,Solidus and Hebrew letters. SY × HL */ + SY if next_grapheme_class!((next_char is HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + /* bypass next_char */ + self.iter.next().unwrap(); + if let Some((idx, next_grapheme)) = self.iter.next() { + *pos = idx + next_grapheme.len(); + } + return self.next(); + } + /* LB22 Do not break between two ellipses, or between letters, numbers or excla- + * mations and ellipsis. 
+ * Examples: ‘9...’, ‘a...’, ‘H...’ + * (AL | HL) × IN */ + AL | HL if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* EX × IN */ + EX if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + EX => { + // LB13 + *pos += grapheme.len(); + return self.next(); + } + /* (ID | EB | EM) × IN */ + ID | EB | EM if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* IN × IN */ + IN if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* NU × IN */ + NU if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* LB23 Do not break between digits and letters. + * (AL | HL) × NU */ + AL | HL if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* NU × (AL | HL) */ + NU if next_grapheme_class!((next_char is AL, HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* LB23a Do not break between numeric prefixes and ideographs, or between ideographs + * and numeric postfixes. 
+ * PR × (ID | EB | EM) */ + PR if next_grapheme_class!((next_char is ID, EB, EM)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* (ID | EB | EM) × PO */ + ID | EB | EM if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* B24 Do not break between numeric prefix/postfix and letters, or between + letters and prefix/postfix. + (PR | PO) × (AL | HL)*/ + PR | PO if next_grapheme_class!((next_char is AL, HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /*(AL | HL) × (PR | PO) */ + AL | HL if next_grapheme_class!((next_char is PR, PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* LB25 Do not break between the following pairs of classes relevant to numbers: + * CL × PO */ + CL if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* CP × PO */ + CP if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* CL × PR */ + CL if next_grapheme_class!((next_char is PR)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* CP × PR */ + CP if next_grapheme_class!((next_char is PR)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* NU × PO */ + NU if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = 
idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* NU × PR */ + NU if next_grapheme_class!((next_char is PR)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* PO × OP */ + PO if next_grapheme_class!((next_char is OP)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* PO × NU */ + PO if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* PR × OP */ + PR if next_grapheme_class!((next_char is OP)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* PR × NU */ + PR if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* HY × NU */ + HY if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* IS × NU */ + IS if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* NU × NU */ + NU if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* SY × NU */ + SY if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* LB26 Do not break a Korean syllable. 
+ * JL × (JL | JV | H2 | H3) */ + JL if next_grapheme_class!((next_char is JL, JV, H2, H3)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* (JV | H2) × (JV | JT) */ + JV | H2 if next_grapheme_class!((next_char is JV, JT)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* (JT | H3) × JT */ + JT | H3 if next_grapheme_class!((next_char is JT)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* LB27 Treat a Korean Syllable Block the same as ID. + * (JL | JV | JT | H2 | H3) × IN */ + JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* (JL | JV | JT | H2 | H3) × PO */ + JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* PR × (JL | JV | JT | H2 | H3) */ + PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* LB28 Do not break between alphabetics (“at”). + (AL | HL) × (AL | HL) */ + AL | HL if next_grapheme_class!((next_char is AL, HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* LB29 Do not break between numeric punctuation and alphabetics (“e.g.”). 
+ IS × (AL | HL) */ + IS if next_grapheme_class!((next_char is AL, HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* LB30 Do not break between letters, numbers, or ordinary symbols and opening + or closing parentheses. + (AL | HL | NU) × OP */ + AL | HL | NU if next_grapheme_class!((next_char is OP)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /* CP × (AL | HL | NU) */ + CP if next_grapheme_class!((next_char is AL, HL , NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + /*LB30b Do not break between an emoji base and an emoji modifier. + * EB × EM */ + EB if next_grapheme_class!((next_char is EM)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + return self.next(); + } + RI => { + /* LB30a Break between two regional indicator symbols if and only if there are an + * even number of regional indicators preceding the position of the break. 
+ * sot (RI RI)* RI × RI + * [^RI] (RI RI)* RI × RI */ + *reg_ind_streak += 1; + *pos += grapheme.len(); + if *reg_ind_streak % 2 == 1 { + return Some((*pos - grapheme.len(), BreakAllowed)); + } + self.iter.next(); + return self.next(); + } + _ => { + *pos += grapheme.len(); + return Some((*pos - grapheme.len(), BreakAllowed)); + } + } + } +} + +fn search_table(c: u32, t: &'static [(u32, u32, LineBreakClass)]) -> LineBreakClass { + match t.binary_search_by(|&(lo, hi, _)| { + if lo <= c && c <= hi { + Ordering::Equal + } else if hi < c { + Ordering::Less + } else { + Ordering::Greater + } + }) { + Ok(idx) => t[idx].2, + Err(_) => XX, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_line_breaks() { + let s = "Fell past it.\n\n‘Well!’ thought Alice to herself."; + let breaks = LineBreakCandidateIter::new(s).collect::<Vec<(usize, LineBreakCandidate)>>(); + let mut prev = 0; + for b in breaks { + println!("{:?}", &s[prev..b.0]); + prev = b.0; + } + println!("{:?}", &s[prev..]); + } +} diff --git a/text_processing/src/tables.rs b/text_processing/src/tables.rs new file mode 100644 index 00000000..4a6f885f --- /dev/null +++ b/text_processing/src/tables.rs @@ -0,0 +1,3389 @@ +use crate::types::LineBreakClass; +use crate::types::LineBreakClass::*; + +pub const LINE_BREAK_RULES: &'static [(u32, u32, LineBreakClass)] = &[ + (0x0, 0x8, CM), + (0x9, 0x9, BA), + (0xA, 0xA, LF), + (0xB, 0xC, BK), + (0xD, 0xD, CR), + (0xE, 0x1F, CM), + (0x20, 0x20, SP), + (0x21, 0x21, EX), + (0x22, 0x22, QU), + (0x23, 0x23, AL), + (0x24, 0x24, PR), + (0x25, 0x25, PO), + (0x26, 0x26, AL), + (0x27, 0x27, QU), + (0x28, 0x28, OP), + (0x29, 0x29, CP), + (0x2A, 0x2A, AL), + (0x2B, 0x2B, PR), + (0x2C, 0x2C, IS), + (0x2D, 0x2D, HY), + (0x2E, 0x2E, IS), + (0x2F, 0x2F, SY), + (0x30, 0x39, NU), + (0x3A, 0x3B, IS), + (0x3C, 0x3E, AL), + (0x3F, 0x3F, EX), + (0x40, 0x40, AL), + (0x41, 0x5A, AL), + (0x5B, 0x5B, OP), + (0x5C, 0x5C, PR), + (0x5D, 0x5D, CP), + (0x5E, 0x5E, AL), + (0x5F, 
0x5F, AL), + (0x60, 0x60, AL), + (0x61, 0x7A, AL), + (0x7B, 0x7B, OP), + (0x7C, 0x7C, BA), + (0x7D, 0x7D, CL), + (0x7E, 0x7E, AL), + (0x7F, 0x7F, CM), + (0x80, 0x84, CM), + (0x85, 0x85, NL), + (0x86, 0x9F, CM), + (0xA0, 0xA0, GL), + (0xA1, 0xA1, OP), + (0xA2, 0xA2, PO), + (0xA3, 0xA5, PR), + (0xA6, 0xA6, AL), + (0xA7, 0xA7, AI), + (0xA8, 0xA8, AI), + (0xA9, 0xA9, AL), + (0xAA, 0xAA, AI), + (0xAB, 0xAB, QU), + (0xAC, 0xAC, AL), + (0xAD, 0xAD, BA), + (0xAE, 0xAE, AL), + (0xAF, 0xAF, AL), + (0xB0, 0xB0, PO), + (0xB1, 0xB1, PR), + (0xB2, 0xB3, AI), + (0xB4, 0xB4, BB), + (0xB5, 0xB5, AL), + (0xB6, 0xB7, AI), + (0xB8, 0xB8, AI), + (0xB9, 0xB9, AI), + (0xBA, 0xBA, AI), + (0xBB, 0xBB, QU), + (0xBC, 0xBE, AI), + (0xBF, 0xBF, OP), + (0xC0, 0xD6, AL), + (0xD7, 0xD7, AI), + (0xD8, 0xF6, AL), + (0xF7, 0xF7, AI), + (0xF8, 0xFF, AL), + (0x100, 0x17F, AL), + (0x180, 0x1BA, AL), + (0x1BB, 0x1BB, AL), + (0x1BC, 0x1BF, AL), + (0x1C0, 0x1C3, AL), + (0x1C4, 0x24F, AL), + (0x250, 0x293, AL), + (0x294, 0x294, AL), + (0x295, 0x2AF, AL), + (0x2B0, 0x2C1, AL), + (0x2C2, 0x2C5, AL), + (0x2C6, 0x2C6, AL), + (0x2C7, 0x2C7, AI), + (0x2C8, 0x2C8, BB), + (0x2C9, 0x2CB, AI), + (0x2CC, 0x2CC, BB), + (0x2CD, 0x2CD, AI), + (0x2CE, 0x2CF, AL), + (0x2D0, 0x2D0, AI), + (0x2D1, 0x2D1, AL), + (0x2D2, 0x2D7, AL), + (0x2D8, 0x2DB, AI), + (0x2DC, 0x2DC, AL), + (0x2DD, 0x2DD, AI), + (0x2DE, 0x2DE, AL), + (0x2DF, 0x2DF, BB), + (0x2E0, 0x2E4, AL), + (0x2E5, 0x2EB, AL), + (0x2EC, 0x2EC, AL), + (0x2ED, 0x2ED, AL), + (0x2EE, 0x2EE, AL), |