summaryrefslogtreecommitdiffstats
path: root/text_processing
diff options
context:
space:
mode:
authorManos Pitsidianakis <el13635@mail.ntua.gr>2019-07-27 01:56:07 +0300
committerManos Pitsidianakis <el13635@mail.ntua.gr>2019-07-27 01:56:07 +0300
commit5b679be782e4930be393654c1a3095422bcf242e (patch)
tree9382ae647d0fd6a65ad0faf25189ba843bd839a8 /text_processing
parentd84ceca88e1882a8db3c7630633a3f840f4593d6 (diff)
text_processing: implement Unicode line breaking algorithm
Not conforming to the unicode standard yet
Diffstat (limited to 'text_processing')
-rw-r--r--text_processing/Cargo.toml1
-rw-r--r--text_processing/build.rs73
-rw-r--r--text_processing/src/lib.rs4
-rw-r--r--text_processing/src/line_break.rs703
-rw-r--r--text_processing/src/tables.rs3389
-rw-r--r--text_processing/src/types.rs102
6 files changed, 4272 insertions, 0 deletions
diff --git a/text_processing/Cargo.toml b/text_processing/Cargo.toml
index 4b6b1c02..2074a00c 100644
--- a/text_processing/Cargo.toml
+++ b/text_processing/Cargo.toml
@@ -4,6 +4,7 @@ version = "0.0.1" #:version
authors = ["Manos Pitsidianakis <el13635@mail.ntua.gr>"]
workspace = ".."
edition = "2018"
+build = "build.rs"
[dependencies]
unicode-segmentation = "1.2.1"
diff --git a/text_processing/build.rs b/text_processing/build.rs
new file mode 100644
index 00000000..d740676b
--- /dev/null
+++ b/text_processing/build.rs
@@ -0,0 +1,73 @@
+const LINE_BREAK_TABLE_URL: &str = "http://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt";
+use std::fs::File;
+use std::io::prelude::*;
+use std::io::BufReader;
+use std::path::PathBuf;
+use std::process::Command;
+
+include!("src/types.rs");
+
+fn main() -> Result<(), std::io::Error> {
+ let mod_path = PathBuf::from("src/tables.rs");
+ if mod_path.exists() {
+ eprintln!(
+ "{} already exists, delete it if you want to replace it.",
+ mod_path.display()
+ );
+ std::process::exit(0);
+ }
+ let mut tmpdir_path = PathBuf::from(
+ std::str::from_utf8(&Command::new("mktemp").arg("-d").output()?.stdout)
+ .unwrap()
+ .trim(),
+ );
+ tmpdir_path.push("LineBreak.txt");
+ Command::new("curl")
+ .args(&["-o", tmpdir_path.to_str().unwrap(), LINE_BREAK_TABLE_URL])
+ .output()?;
+
+ let file = File::open(&tmpdir_path)?;
+ let buf_reader = BufReader::new(file);
+
+ let mut line_break_table: Vec<(u32, u32, LineBreakClass)> = Vec::with_capacity(3800);
+ for line in buf_reader.lines() {
+ let line = line.unwrap();
+ if line.starts_with('#') || line.starts_with(' ') || line.is_empty() {
+ continue;
+ }
+ let tokens: &str = line.split_whitespace().next().unwrap();
+
+ let semicolon_idx: usize = tokens.chars().position(|c| c == ';').unwrap();
+ /* LineBreak.txt list is ascii encoded so we can assume each char takes one byte: */
+ let chars_str: &str = &tokens[..semicolon_idx];
+
+ let mut codepoint_iter = chars_str.split("..");
+
+ let first_codepoint: u32 =
+ u32::from_str_radix(std::dbg!(codepoint_iter.next().unwrap()), 16).unwrap();
+
+ let sec_codepoint: u32 = codepoint_iter
+ .next()
+ .map(|v| u32::from_str_radix(std::dbg!(v), 16).unwrap())
+ .unwrap_or(first_codepoint);
+ let class = &tokens[semicolon_idx + 1..semicolon_idx + 1 + 2];
+ line_break_table.push((first_codepoint, sec_codepoint, LineBreakClass::from(class)));
+ }
+
+ let mut file = File::create(&mod_path)?;
+ file.write_all(b"use crate::types::LineBreakClass::*;\n")
+ .unwrap();
+ file.write_all(b"use crate::types::LineBreakClass;\n\n")
+ .unwrap();
+ file.write_all(b"const line_break_rules: &'static [(u32, u32, LineBreakClass)] = &[\n")
+ .unwrap();
+ for l in &line_break_table {
+ file.write_all(format!(" (0x{:X}, 0x{:X}, {:?}),\n", l.0, l.1, l.2).as_bytes())
+ .unwrap();
+ }
+ file.write_all(b"];").unwrap();
+ std::fs::remove_file(&tmpdir_path).unwrap();
+ tmpdir_path.pop();
+ std::fs::remove_dir(&tmpdir_path).unwrap();
+ Ok(())
+}
diff --git a/text_processing/src/lib.rs b/text_processing/src/lib.rs
index 3b7b33ae..59d03123 100644
--- a/text_processing/src/lib.rs
+++ b/text_processing/src/lib.rs
@@ -1,4 +1,8 @@
pub mod grapheme_clusters;
+pub mod line_break;
+mod tables;
+mod types;
pub mod wcwidth;
pub use grapheme_clusters::*;
+pub use line_break::*;
pub use wcwidth::*;
diff --git a/text_processing/src/line_break.rs b/text_processing/src/line_break.rs
new file mode 100644
index 00000000..5f42e7b9
--- /dev/null
+++ b/text_processing/src/line_break.rs
@@ -0,0 +1,703 @@
+extern crate unicode_segmentation;
+use self::unicode_segmentation::UnicodeSegmentation;
+use crate::tables::LINE_BREAK_RULES;
+use crate::types::LineBreakClass;
+use core::cmp::Ordering;
+use core::iter::Peekable;
+use core::str::FromStr;
+use LineBreakClass::*;
+
+/// Kind of line break opportunity reported by [`LineBreakCandidateIter`].
+///
+/// The type is a plain fieldless enum, so it is `Copy` and fully comparable.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum LineBreakCandidate {
+    /// The line must be broken at this position.
+    MandatoryBreak,
+    /// The line may be broken at this position.
+    BreakAllowed,
+    // NoBreak, Not used.
+}
+
+use LineBreakCandidate::*;
+
+/// Iterator over the line break candidates of a string slice.
+///
+/// Yields `(byte offset, LineBreakCandidate)` pairs; see the `Iterator` implementation.
+pub struct LineBreakCandidateIter<'a> {
+    /// The text being scanned.
+    text: &'a str,
+    /// Byte offset into `text` up to which the input has been processed.
+    pos: usize,
+    /// Grapheme clusters of `text` together with their byte offsets.
+    iter: Peekable<unicode_segmentation::GraphemeIndices<'a>>,
+    /* Needed for rule LB30a */
+    reg_ind_streak: u32,
+}
+
+impl<'a> LineBreakCandidateIter<'a> {
+    /// Creates an iterator positioned at the start of `text`, segmenting it into extended
+    /// grapheme clusters.
+    pub fn new(text: &'a str) -> Self {
+        let iter = text.grapheme_indices(true).peekable();
+        Self {
+            text,
+            pos: 0,
+            iter,
+            reg_ind_streak: 0,
+        }
+    }
+}
+
+/// Parses the first code point of the string slice `$grapheme` into a `char`.
+///
+/// Evaluates to the `Result` of `char::from_str`: `Ok(c)` with the first code point, or
+/// `Err(_)` when `$grapheme` is empty. An empty slice used to panic here.
+/// `$grapheme` is evaluated exactly once and may be a `&str` or a reference to one.
+macro_rules! get_base_character {
+    ($grapheme:expr) => {{
+        let cluster: &str = $grapheme;
+        /* Byte length of the first code point; 0 for an empty slice, which `from_str`
+         * then rejects with an error. */
+        let first_len = cluster.chars().next().map_or(0, char::len_utf8);
+        char::from_str(&cluster[..first_len])
+    }};
+}
+
+/// Evaluates to the line breaking class that `LINE_BREAK_RULES` assigns to the first code
+/// point of `$grapheme`, or to `XX` when no first code point can be parsed.
+///
+/// Side effects: none
+macro_rules! get_class {
+    ($grapheme:expr) => {{
+        get_base_character!($grapheme)
+            .map(|base| search_table(base as u32, LINE_BREAK_RULES))
+            .unwrap_or(XX)
+    }};
+}
+
+/// Looks at the class of an upcoming grapheme. Two forms:
+///
+/// * `next_grapheme_class!(iter, grapheme)` advances `iter`. If it yields a grapheme, that
+///   grapheme is stored in `grapheme` and the macro evaluates to `Some(class)`; otherwise
+///   `grapheme` is left alone and the macro evaluates to `None`.
+/// * `next_grapheme_class!((next_char is A))` / `next_grapheme_class!((next_char is A, B))`
+///   evaluates to `true` when the optional `next_char` is present and its class equals
+///   (one of) the given classes. This form has no side effects.
+macro_rules! next_grapheme_class {
+    ($graph_iter:ident, $grapheme:ident) => ({
+        match $graph_iter.next() {
+            Some((_, g)) => {
+                $grapheme = g;
+                Some(get_class!(g))
+            }
+            None => None,
+        }
+    });
+    (($next_char:ident is $class:expr)) => ({
+        $next_char.map_or(false, |next| get_class!((next.1)) == $class)
+    });
+    (($next_char:ident is $($class:ident),+)) => ({
+        $next_char.map_or(false, |next| {
+            let next_class = get_class!((next.1));
+            $(next_class == $class)||+
+        })
+    });
+}
+
+/// Returns positions where breaks can happen
+///
+/// Every item is `(byte offset, kind)`. The end of a non-empty text is reported as a
+/// `MandatoryBreak` (LB3).
+///
+/// Examples:
+/// ```
+/// use text_processing::{self, LineBreakCandidate::{self, *}};
+/// use text_processing::line_break::LineBreakCandidateIter;
+///
+/// assert!(LineBreakCandidateIter::new("").collect::<Vec<(usize, LineBreakCandidate)>>().is_empty());
+/// assert_eq!(&[(7, BreakAllowed), (12, MandatoryBreak)],
+///            LineBreakCandidateIter::new("Sample Text.").collect::<Vec<(usize, LineBreakCandidate)>>().as_slice());
+/// assert_eq!(&[(3, MandatoryBreak), (7, MandatoryBreak), (10, BreakAllowed), (17, MandatoryBreak)],
+///            LineBreakCandidateIter::new("Sa\nmp\r\nle T(e)xt.").collect::<Vec<(usize, LineBreakCandidate)>>().as_slice());
+/// ```
+impl<'a> Iterator for LineBreakCandidateIter<'a> {
+    type Item = (usize, LineBreakCandidate);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // After end of text, there are no breaks.
+        if self.pos >= self.text.len() {
+            return None;
+        }
+        self.scan()
+    }
+}
+
+impl<'a> LineBreakCandidateIter<'a> {
+    /// LB3: reports the mandatory break at the end of the text and marks the whole text as
+    /// processed, so that the following call to `next` returns `None`.
+    fn end_of_text(&mut self) -> Option<(usize, LineBreakCandidate)> {
+        self.pos = self.text.len();
+        Some((self.pos, MandatoryBreak))
+    }
+
+    /// Continues scanning after a "do not break" rule has advanced `pos`.
+    ///
+    /// If that rule consumed the rest of the text, the end-of-text break (LB3) is still
+    /// due and is reported here; otherwise the next grapheme is examined.
+    fn resume(&mut self) -> Option<(usize, LineBreakCandidate)> {
+        if self.pos >= self.text.len() {
+            return self.end_of_text();
+        }
+        self.scan()
+    }
+
+    /// Class of the first character of `rest` after leading whitespace, or `XX` when only
+    /// whitespace (or nothing) is left.
+    fn lookahead_class(rest: &str) -> LineBreakClass {
+        let rest = rest.trim_start();
+        if rest.is_empty() {
+            XX
+        } else {
+            get_class!(rest)
+        }
+    }
+
+    /// Returns `true` for the class pairs `before × after` of rules LB21a, LB22 to LB30 and
+    /// LB30b, i.e. the pairs for which the scanner glues the next grapheme to the current one.
+    fn forbids_break(before: LineBreakClass, after: LineBreakClass) -> bool {
+        match before {
+            /* LB21a HL (HY | BA) ×, plus everything listed for AL below */
+            HL => match after {
+                HY | BA | IN | NU | PR | PO | AL | HL | OP => true,
+                _ => false,
+            },
+            /* LB22 (AL | HL) × IN, LB23 (AL | HL) × NU, LB24 (AL | HL) × (PR | PO),
+             * LB28 (AL | HL) × (AL | HL), LB30 (AL | HL | NU) × OP */
+            AL => match after {
+                IN | NU | PR | PO | AL | HL | OP => true,
+                _ => false,
+            },
+            /* LB22 EX × IN */
+            EX => after == IN,
+            /* LB22 (ID | EB | EM) × IN, LB23a (ID | EB | EM) × PO, LB30b EB × EM */
+            EB => match after {
+                IN | PO | EM => true,
+                _ => false,
+            },
+            ID | EM => match after {
+                IN | PO => true,
+                _ => false,
+            },
+            /* LB22 IN × IN */
+            IN => after == IN,
+            /* LB22 NU × IN, LB23 NU × (AL | HL), LB25 NU × (PO | PR | NU), LB30 NU × OP */
+            NU => match after {
+                IN | AL | HL | PO | PR | NU | OP => true,
+                _ => false,
+            },
+            /* LB23a PR × (ID | EB | EM), LB24 PR × (AL | HL), LB25 PR × (OP | NU),
+             * LB27 PR × (JL | JV | JT | H2 | H3) */
+            PR => match after {
+                ID | EB | EM | AL | HL | OP | NU | JL | JV | JT | H2 | H3 => true,
+                _ => false,
+            },
+            /* LB24 PO × (AL | HL), LB25 PO × (OP | NU) */
+            PO => match after {
+                AL | HL | OP | NU => true,
+                _ => false,
+            },
+            /* LB25 CL × (PO | PR) */
+            CL => match after {
+                PO | PR => true,
+                _ => false,
+            },
+            /* LB25 CP × (PO | PR), LB30 CP × (AL | HL | NU) */
+            CP => match after {
+                PO | PR | AL | HL | NU => true,
+                _ => false,
+            },
+            /* LB25 HY × NU, SY × NU */
+            HY | SY => after == NU,
+            /* LB25 IS × NU, LB29 IS × (AL | HL) */
+            IS => match after {
+                NU | AL | HL => true,
+                _ => false,
+            },
+            /* LB26 JL × (JL | JV | H2 | H3), LB27 JL × (IN | PO) */
+            JL => match after {
+                JL | JV | H2 | H3 | IN | PO => true,
+                _ => false,
+            },
+            /* LB26 (JV | H2) × (JV | JT), LB27 × (IN | PO) */
+            JV | H2 => match after {
+                JV | JT | IN | PO => true,
+                _ => false,
+            },
+            /* LB26 (JT | H3) × JT, LB27 × (IN | PO) */
+            JT | H3 => match after {
+                JT | IN | PO => true,
+                _ => false,
+            },
+            _ => false,
+        }
+    }
+
+    /// Examines the grapheme at `pos` and either reports a break or, when a "do not break"
+    /// rule applies, advances and keeps scanning through [`Self::resume`].
+    ///
+    /// Must only be called while `pos` is inside the text.
+    fn scan(&mut self) -> Option<(usize, LineBreakCandidate)> {
+        // LB3 Always break at the end of text.
+        // The grapheme at `pos` is the last one when it ends exactly at the end of the
+        // text. The plain `pos + 1` test is kept for one-byte graphemes; the peek covers
+        // final graphemes that are longer than one byte.
+        let text_len = self.text.len();
+        let on_last_grapheme = self.pos + 1 == text_len
+            || match self.iter.peek() {
+                Some(&(idx, g)) => idx == self.pos && idx + g.len() == text_len,
+                None => false,
+            };
+        if on_last_grapheme {
+            return self.end_of_text();
+        }
+
+        let (idx, mut grapheme) = match self.iter.next() {
+            Some(item) => item,
+            // No grapheme left although `pos` has not reached the end: nothing more can be
+            // examined, so only the end-of-text break remains.
+            None => return self.end_of_text(),
+        };
+        let LineBreakCandidateIter {
+            ref mut iter,
+            ref text,
+            ref mut reg_ind_streak,
+            ref mut pos,
+        } = self;
+        let iter = iter.by_ref();
+
+        debug_assert_eq!(idx, *pos);
+
+        // LB2 Never break at the start of text
+        if idx == 0 {
+            *pos += grapheme.len();
+            return self.resume();
+        }
+
+        /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ,
+         * SA, SG, and XX into other line breaking classes depending on criteria outside the scope
+         * of this algorithm.
+         *
+         * In the absence of such criteria all characters with a specific combination of original
+         * class and General_Category property value are resolved as follows:
+         * Resolved Original    General_Category
+         * AL       AI, SG, XX  Any
+         * CM       SA          Only Mn or Mc
+         * AL       SA          Any except Mn and Mc
+         * NS       SJ          Any
+         */
+
+        // TODO: LB1
+
+        /* LB9 is covered by working on grapheme clusters. A cluster can still *start* with
+         * a CM character (the table classifies control characters as CM, and a combining
+         * mark may follow a line break); LB10 says to treat such a character as AL. */
+        let class = match get_class!(grapheme) {
+            CM => AL,
+            other => other,
+        };
+
+        if class != RI {
+            *reg_ind_streak = 0;
+        }
+
+        /* Check if next character class allows breaks before it */
+        let next_char: Option<&(usize, &str)> = iter.peek();
+
+        match class {
+            BK => {
+                // LB4 Always Break after hard line breaks.
+                *pos += grapheme.len();
+                return Some((*pos, MandatoryBreak));
+            }
+            // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
+            CR if next_grapheme_class!((next_char is LF)) => {
+                *pos += grapheme.len();
+                assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
+                *pos += grapheme.len();
+                return Some((*pos, MandatoryBreak));
+            }
+            CR | LF | NL => {
+                *pos += grapheme.len();
+                return Some((*pos, MandatoryBreak));
+            }
+            _ => {}
+        }
+        if let Some((_, next_grapheme)) = next_char {
+            match get_class!(next_grapheme) {
+                /* LB6 Do not break before hard line breaks. × ( BK | CR | LF | NL )
+                 * LB7 Do not break before spaces or zero width space. × SP × ZW */
+                BK | CR | LF | NL | SP | ZW => {
+                    *pos += grapheme.len();
+                    return self.resume();
+                }
+                _ => {}
+            }
+        }
+        match class {
+            ZW => {
+                // LB8 Break before any character following a zero-width space, even if one or more
+                // spaces intervene
+                // ZW SP* ÷
+                // NOTE(review): the loop also pulls the first non-space grapheme out of the
+                // iterator without adding it to `pos` -- confirm this is intended.
+                *pos += grapheme.len();
+                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                    *pos += grapheme.len();
+                }
+                return Some((*pos, MandatoryBreak));
+            }
+            ZWJ => {
+                // LB8a Do not break after a zero width joiner.
+                *pos += grapheme.len();
+                return self.resume();
+            }
+            WJ => {
+                /* LB11 Do not break before or after Word joiner and related characters. */
+                *pos += grapheme.len();
+                /* Get next grapheme */
+                if next_grapheme_class!(iter, grapheme).is_some() {
+                    *pos += grapheme.len();
+                }
+                return self.resume();
+            }
+            GL => {
+                /* LB12 Do not break after NBSP and related characters. */
+                *pos += grapheme.len();
+                return self.resume();
+            }
+            _ => {}
+        }
+        if let Some((next_idx, next_grapheme)) = next_char {
+            match get_class!(next_grapheme) {
+                GL if ![SP, BA, HY].contains(&class) => {
+                    /* LB12a Do not break before NBSP and related characters, except after spaces and
+                     * hyphens. [^SP BA HY] × GL
+                     * Also LB12 Do not break after NBSP and related characters */
+                    *pos += grapheme.len();
+                    return self.resume();
+                }
+                /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
+                CL | CP | EX | IS | SY => {
+                    *pos = *next_idx;
+                    return self.resume();
+                }
+                _ => {}
+            }
+        }
+
+        match class {
+            /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
+            SP if [CL, CP, EX, IS, SY].contains(&Self::lookahead_class(&text[idx..])) => {
+                *pos += grapheme.len();
+                /* The guard found such a character after the whitespace, so the iterator
+                 * cannot run dry before reaching it. */
+                while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap())
+                {
+                    *pos += grapheme.len();
+                }
+                *pos += grapheme.len();
+                return self.resume();
+            }
+            OP => {
+                /* LB14 Do not break after ‘[’, even after spaces.
+                 * OP SP* ×
+                 */
+                while let Some((idx, grapheme)) = self.iter.next() {
+                    *pos = idx + grapheme.len();
+                    if get_class!(grapheme) != SP {
+                        break;
+                    }
+                }
+                return self.resume();
+            }
+            /* NOTE(review): the look-ahead in the next three guards starts at the current
+             * grapheme itself (`text[idx..]`), so it yields the current class: the QU and
+             * CL | CP guards can never hold and the B2 guard always holds. The look-ahead
+             * probably should start after the current grapheme -- confirm before changing. */
+            QU if Self::lookahead_class(&text[idx..]) == OP => {
+                /* LB15 Do not break within ‘”[’, even with intervening spaces.
+                 * QU SP* × OP */
+                *pos += grapheme.len();
+                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                    *pos += grapheme.len();
+                }
+                *pos = idx;
+                return self.resume();
+            }
+            QU => {
+                /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
+                *pos += grapheme.len();
+                if let Some((_, g)) = self.iter.next() {
+                    *pos += g.len();
+                }
+                return self.resume();
+            }
+            CL | CP if Self::lookahead_class(&text[idx..]) == NS => {
+                /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
+                 * intervening spaces.
+                 * (CL | CP) SP* × NS */
+                *pos += grapheme.len();
+                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                    *pos += grapheme.len();
+                }
+                return self.resume();
+            }
+            B2 if Self::lookahead_class(&text[idx..]) == B2 => {
+                /* B2 SP* × B2 */
+                *pos += grapheme.len();
+                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                    *pos += grapheme.len();
+                }
+                return self.resume();
+            }
+            SP => {
+                /* LB18 Break after spaces. SP ÷ */
+                // Space 0x20 is 1 byte long.
+                *pos += 1;
+                return Some((*pos, BreakAllowed));
+            }
+            _ => {}
+        }
+        if let Some((next_idx, next_grapheme)) = next_char {
+            match get_class!(next_grapheme) {
+                QU if class != SP => {
+                    /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
+                    *pos = *next_idx + next_grapheme.len();
+                    self.iter.next();
+                    return self.resume();
+                }
+                _ => {}
+            }
+        }
+        match class {
+            CB => {
+                /* LB20 Break before and after unresolved CB. */
+                *pos += grapheme.len();
+                return Some((*pos - 1, BreakAllowed));
+            }
+            /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
+             * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
+            BB => {
+                *pos += grapheme.len();
+                return self.resume();
+            }
+            _ => {}
+        }
+
+        if let Some((_, next_grapheme)) = next_char {
+            match get_class!(next_grapheme) {
+                BA | HY | NS => {
+                    /* LB21 × BA, × HY, × NS */
+                    *pos += grapheme.len();
+                    return self.resume();
+                }
+                _ => {}
+            }
+        }
+
+        /* Pair rules. Copy what is needed out of the peeked item so the iterator can be
+         * advanced afterwards. A following cluster that starts with CM counts as AL (LB10). */
+        let next_class = next_char.map(|&(_, g)| match get_class!(g) {
+            CM => AL,
+            other => other,
+        });
+        let next_end = next_char.map(|&(next_idx, g)| next_idx + g.len());
+        if let (Some(next_class), Some(next_end)) = (next_class, next_end) {
+            if Self::forbids_break(class, next_class) {
+                /* No break between the two: skip over the next grapheme as well. */
+                *pos = next_end;
+                self.iter.next();
+                return self.resume();
+            }
+            if class == SY && next_class == HL {
+                /* LB21b Don’t break between Solidus and Hebrew letters. SY × HL */
+                *pos = next_end;
+                /* bypass next_char */
+                self.iter.next().unwrap();
+                if let Some((idx, next_grapheme)) = self.iter.next() {
+                    *pos = idx + next_grapheme.len();
+                }
+                return self.resume();
+            }
+        }
+
+        match class {
+            EX => {
+                // LB13
+                *pos += grapheme.len();
+                self.resume()
+            }
+            RI => {
+                /* LB30a Break between two regional indicator symbols if and only if there are an
+                 * even number of regional indicators preceding the position of the break.
+                 * sot (RI RI)* RI × RI
+                 * [^RI] (RI RI)* RI × RI */
+                *reg_ind_streak += 1;
+                *pos += grapheme.len();
+                if *reg_ind_streak % 2 == 1 {
+                    return Some((*pos - grapheme.len(), BreakAllowed));
+                }
+                // NOTE(review): this drops a grapheme from the iterator without advancing
+                // `pos` -- confirm this is intended.
+                self.iter.next();
+                self.resume()
+            }
+            _ => {
+                /* No rule forbids it: a break is allowed before this grapheme. */
+                *pos += grapheme.len();
+                Some((*pos - grapheme.len(), BreakAllowed))
+            }
+        }
+    }
+}
+
+/// Looks up code point `c` in `t`, a list of inclusive `(first, last, class)` ranges.
+///
+/// Returns the class of the range containing `c`, or `XX` when no range contains it.
+/// The lookup is a binary search, so `t` has to be sorted by code point.
+fn search_table(c: u32, t: &'static [(u32, u32, LineBreakClass)]) -> LineBreakClass {
+    let found = t.binary_search_by(|&(lo, hi, _)| match (lo <= c, c <= hi) {
+        // `c` lies inside this range.
+        (true, true) => Ordering::Equal,
+        // The range ends before `c`.
+        (_, false) => Ordering::Less,
+        // The range starts after `c`.
+        (false, true) => Ordering::Greater,
+    });
+    found.map(|idx| t[idx].2).unwrap_or(XX)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Smoke test: prints the pieces of a sample text between consecutive break candidates.
+    /// Slicing the text at the reported offsets panics if an offset is out of order or not
+    /// on a character boundary.
+    #[test]
+    fn test_line_breaks() {
+        let s = "Fell past it.\n\n‘Well!’ thought Alice to herself.";
+        let breaks: Vec<(usize, LineBreakCandidate)> = LineBreakCandidateIter::new(s).collect();
+        let mut start = 0;
+        for (offset, _) in breaks {
+            println!("{:?}", &s[start..offset]);
+            start = offset;
+        }
+        println!("{:?}", &s[start..]);
+    }
+}
diff --git a/text_processing/src/tables.rs b/text_processing/src/tables.rs
new file mode 100644
index 00000000..4a6f885f
--- /dev/null
+++ b/text_processing/src/tables.rs
@@ -0,0 +1,3389 @@
+use crate::types::LineBreakClass;
+use crate::types::LineBreakClass::*;
+
+pub const LINE_BREAK_RULES: &'static [(u32, u32, LineBreakClass)] = &[
+ (0x0, 0x8, CM),
+ (0x9, 0x9, BA),
+ (0xA, 0xA, LF),
+ (0xB, 0xC, BK),
+ (0xD, 0xD, CR),
+ (0xE, 0x1F, CM),
+ (0x20, 0x20, SP),
+ (0x21, 0x21, EX),
+ (0x22, 0x22, QU),
+ (0x23, 0x23, AL),
+ (0x24, 0x24, PR),
+ (0x25, 0x25, PO),
+ (0x26, 0x26, AL),
+ (0x27, 0x27, QU),
+ (0x28, 0x28, OP),
+ (0x29, 0x29, CP),
+ (0x2A, 0x2A, AL),
+ (0x2B, 0x2B, PR),
+ (0x2C, 0x2C, IS),
+ (0x2D, 0x2D, HY),
+ (0x2E, 0x2E, IS),
+ (0x2F, 0x2F, SY),
+ (0x30, 0x39, NU),
+ (0x3A, 0x3B, IS),
+ (0x3C, 0x3E, AL),
+ (0x3F, 0x3F, EX),
+ (0x40, 0x40, AL),
+ (0x41, 0x5A, AL),
+ (0x5B, 0x5B, OP),
+ (0x5C, 0x5C, PR),
+ (0x5D, 0x5D, CP),
+ (0x5E, 0x5E, AL),
+ (0x5F, 0x5F, AL),
+ (0x60, 0x60, AL),
+ (0x61, 0x7A, AL),
+ (0x7B, 0x7B, OP),
+ (0x7C, 0x7C, BA),
+ (0x7D, 0x7D, CL),
+ (0x7E, 0x7E, AL),
+ (0x7F, 0x7F, CM),
+ (0x80, 0x84, CM),
+ (0x85, 0x85, NL),
+ (0x86, 0x9F, CM),
+ (0xA0, 0xA0, GL),
+ (0xA1, 0xA1, OP),
+ (0xA2, 0xA2, PO),
+ (0xA3, 0xA5, PR),
+ (0xA6, 0xA6, AL),
+ (0xA7, 0xA7, AI),
+ (0xA8, 0xA8, AI),
+ (0xA9, 0xA9, AL),
+ (0xAA, 0xAA, AI),
+ (0xAB, 0xAB, QU),
+ (0xAC, 0xAC, AL),
+ (0xAD, 0xAD, BA),
+ (0xAE, 0xAE, AL),
+ (0xAF, 0xAF, AL),
+ (0xB0, 0xB0, PO),
+ (0xB1, 0xB1, PR),
+ (0xB2, 0xB3, AI),
+ (0xB4, 0xB4, BB),
+ (0xB5, 0xB5, AL),
+ (0xB6, 0xB7, AI),
+ (0xB8, 0xB8, AI),
+ (0xB9, 0xB9, AI),
+ (0xBA, 0xBA, AI),
+ (0xBB, 0xBB, QU),
+ (0xBC, 0xBE, AI),
+ (0xBF, 0xBF, OP),
+ (0xC0, 0xD6, AL),
+ (0xD7, 0xD7, AI),
+ (0xD8, 0xF6, AL),
+ (0xF7, 0xF7, AI),
+ (0xF8, 0xFF, AL),
+ (0x100, 0x17F, AL),
+ (0x180, 0x1BA, AL),
+ (0x1BB, 0x1BB, AL),
+ (0x1BC, 0x1BF, AL),
+ (0x1C0, 0x1C3, AL),
+ (0x1C4, 0x24F, AL),
+ (0x250, 0x293, AL),
+ (0x294, 0x294, AL),
+ (0x295, 0x2AF, AL),
+ (0x2B0, 0x2C1, AL),
+ (0x2C2, 0x2C5, AL),
+ (0x2C6, 0x2C6, AL),
+ (0x2C7, 0x2C7, AI),
+ (0x2C8, 0x2C8, BB),
+ (0x2C9, 0x2CB, AI),
+ (0x2CC, 0x2CC, BB),
+ (0x2CD, 0x2CD, AI),
+ (0x2CE, 0x2CF, AL),
+ (0x2D0, 0x2D0, AI),
+ (0x2D1, 0x2D1, AL),
+ (0x2D2, 0x2D7, AL),
+ (0x2D8, 0x2DB, AI),
+ (0x2DC, 0x2DC, AL),
+ (0x2DD, 0x2DD, AI),
+ (0x2DE, 0x2DE, AL),
+ (0x2DF, 0x2DF, BB),
+ (0x2E0, 0x2E4, AL),
+ (0x2E5, 0x2EB, AL),
+ (0x2EC, 0x2EC, AL),
+ (0x2ED, 0x2ED, AL),
+ (0x2EE, 0x2EE, AL),