diff options
author | Manos Pitsidianakis <el13635@mail.ntua.gr> | 2020-05-31 01:08:22 +0300 |
---|---|---|
committer | Manos Pitsidianakis <el13635@mail.ntua.gr> | 2020-05-31 01:08:22 +0300 |
commit | b3b9563db0414e4a483f158585dac9d3e7b0a91c (patch) | |
tree | eab08a1ad7b4b68e63d757948131902deb28d318 /melib/src/text_processing | |
parent | 6ceed3cae9a13764262913b4d9a1b8302b213551 (diff) |
LineBreakCandidateIter: make iter non-recursive
A line with lots of graphemes and no break opportunities makes next() call itself
repeatedly and can overflow the stack, so turn the recursion into a loop.
Diffstat (limited to 'melib/src/text_processing')
-rw-r--r-- | melib/src/text_processing/line_break.rs | 1035 |
1 files changed, 519 insertions, 516 deletions
diff --git a/melib/src/text_processing/line_break.rs b/melib/src/text_processing/line_break.rs index 7ba171c1..8e00e47e 100644 --- a/melib/src/text_processing/line_break.rs +++ b/melib/src/text_processing/line_break.rs @@ -130,573 +130,576 @@ macro_rules! next_grapheme_class { impl<'a> Iterator for LineBreakCandidateIter<'a> { type Item = (usize, LineBreakCandidate); fn next(&mut self) -> Option<Self::Item> { - // After end of text, there are no breaks. - if self.pos >= self.text.len() { - return None; - } - // LB3 Always break at the end of text - if self.pos + 1 == self.text.len() { - self.pos += 1; - return Some((self.pos, MandatoryBreak)); - } + loop { + // After end of text, there are no breaks. + if self.pos >= self.text.len() { + return None; + } + // LB3 Always break at the end of text + if self.pos + 1 == self.text.len() { + self.pos += 1; + return Some((self.pos, MandatoryBreak)); + } - let (idx, mut grapheme) = self.iter.next().unwrap(); - let LineBreakCandidateIter { - ref mut iter, - ref text, - ref mut reg_ind_streak, - ref mut pos, - } = self; - let iter = iter.by_ref(); + let LineBreakCandidateIter { + ref mut iter, + ref text, + ref mut reg_ind_streak, + ref mut pos, + } = self; + let (idx, mut grapheme) = iter.next().unwrap(); + let iter = iter.by_ref(); - debug_assert_eq!(idx, *pos); + debug_assert_eq!(idx, *pos); - // LB2 Never break at the start of text - if idx == 0 { - *pos += grapheme.len(); - return self.next(); - } + // LB2 Never break at the start of text + if idx == 0 { + *pos += grapheme.len(); + continue; + } - let class = get_class!(grapheme); + let class = get_class!(grapheme); - if class != RI { - *reg_ind_streak = 0; - } + if class != RI { + *reg_ind_streak = 0; + } - /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ, - * SA, SG, and XX into other line breaking classes depending on criteria outside the scope - * of this algorithm. 
- * - * In the absence of such criteria all characters with a specific combination of original - * class and General_Category property value are resolved as follows: - * Resolved Original General_Category - * AL AI, SG, XX Any - * CM SA Only Mn or Mc - * AL SA Any except Mn and Mc - * NS SJ Any - */ + /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ, + * SA, SG, and XX into other line breaking classes depending on criteria outside the scope + * of this algorithm. + * + * In the absence of such criteria all characters with a specific combination of original + * class and General_Category property value are resolved as follows: + * Resolved Original General_Category + * AL AI, SG, XX Any + * CM SA Only Mn or Mc + * AL SA Any except Mn and Mc + * NS SJ Any + */ - // TODO: LB1 + // TODO: LB1 - /* Check if next character class allows breaks before it */ - let next_char: Option<&(usize, &str)> = iter.peek(); + /* Check if next character class allows breaks before it */ + let next_char: Option<&(usize, &str)> = iter.peek(); - match class { - BK => { - // LB4 Always Break after hard line breaks. - *pos += grapheme.len(); - return Some((*pos, MandatoryBreak)); - } - // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks - CR if next_grapheme_class!((next_char is LF)) => { - *pos += grapheme.len(); - assert!(Some(LF) == next_grapheme_class!(iter, grapheme)); - *pos += grapheme.len(); - return Some((*pos, MandatoryBreak)); - } - CR | LF | NL => { - *pos += grapheme.len(); - return Some((*pos, MandatoryBreak)); - } - _ => {} - } - if let Some((_, next_grapheme)) = next_char { - let next_class = get_class!(next_grapheme); - match next_class { - /* LB6 Do not break before hard line breaks. × ( BK | CR | LF | NL ) */ - BK | CR | LF | NL => { + match class { + BK => { + // LB4 Always Break after hard line breaks. 
*pos += grapheme.len(); - return self.next(); + return Some((*pos, MandatoryBreak)); } - /* LB7 Do not break before spaces or zero width - * space. × SP × ZW */ - SP | ZW => { + // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks + CR if next_grapheme_class!((next_char is LF)) => { + *pos += grapheme.len(); + assert!(Some(LF) == next_grapheme_class!(iter, grapheme)); *pos += grapheme.len(); - return self.next(); + return Some((*pos, MandatoryBreak)); } - _ => {} - } - } - match class { - ZW => { - // LB8 Break before any character following a zero-width space, even if one or more - // spaces intervene - // ZW SP* ÷ - *pos += grapheme.len(); - while Some(SP) == next_grapheme_class!(iter, grapheme) { + CR | LF | NL => { *pos += grapheme.len(); + return Some((*pos, MandatoryBreak)); } - return Some((*pos, MandatoryBreak)); + _ => {} } - ZWJ => { - // LB8a Do not break after a zero width joiner. - *pos += grapheme.len(); - return self.next(); + if let Some((_, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + /* LB6 Do not break before hard line breaks. × ( BK | CR | LF | NL ) */ + BK | CR | LF | NL => { + *pos += grapheme.len(); + continue; + } + /* LB7 Do not break before spaces or zero width + * space. × SP × ZW */ + SP | ZW => { + *pos += grapheme.len(); + continue; + } + _ => {} + } } + match class { + ZW => { + // LB8 Break before any character following a zero-width space, even if one or more + // spaces intervene + // ZW SP* ÷ + *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + return Some((*pos, MandatoryBreak)); + } + ZWJ => { + // LB8a Do not break after a zero width joiner. + *pos += grapheme.len(); + continue; + } - CM => { - // LB9 Do not break a combining character sequence; treat it as if it has the line - // breaking class of the base character in all of the following rules. Treat ZWJ as - // if it were CM. 
- // Treat X (CM | ZWJ)* as if it were X. - // where X is any line break class except BK, CR, LF, NL, SP, or ZW. + CM => { + // LB9 Do not break a combining character sequence; treat it as if it has the line + // breaking class of the base character in all of the following rules. Treat ZWJ as + // if it were CM. + // Treat X (CM | ZWJ)* as if it were X. + // where X is any line break class except BK, CR, LF, NL, SP, or ZW. - *pos += grapheme.len(); - return self.next(); - } - WJ => { - /*: LB11 Do not break before or after Word joiner and related characters.*/ - *pos += grapheme.len(); - /* Get next grapheme */ - if next_grapheme_class!(iter, grapheme).is_some() { *pos += grapheme.len(); + continue; } - return self.next(); - } - GL => { - /*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/ - *pos += grapheme.len(); - return self.next(); - } - _ => {} - } - if let Some((next_idx, next_grapheme)) = next_char { - let next_class = get_class!(next_grapheme); - match next_class { - GL if ![SP, BA, HY].contains(&class) => { - /* LB12a Do not break before NBSP and related characters, except after spaces and - * hyphens. [^SP BA HY] × GL - * Also LB12 Do not break after NBSP and related characters */ + WJ => { + /*: LB11 Do not break before or after Word joiner and related characters.*/ *pos += grapheme.len(); - return self.next(); + /* Get next grapheme */ + if next_grapheme_class!(iter, grapheme).is_some() { + *pos += grapheme.len(); + } + continue; } - /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. 
*/ - CL | CP | EX | IS | SY => { - *pos = *next_idx; - return self.next(); + GL => { + /*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/ + *pos += grapheme.len(); + continue; } _ => {} } - } + if let Some((next_idx, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + GL if ![SP, BA, HY].contains(&class) => { + /* LB12a Do not break before NBSP and related characters, except after spaces and + * hyphens. [^SP BA HY] × GL + * Also LB12 Do not break after NBSP and related characters */ + *pos += grapheme.len(); + continue; + } + /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */ + CL | CP | EX | IS | SY => { + *pos = *next_idx; + continue; + } + _ => {} + } + } - match class { - /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */ - SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => { - *pos += grapheme.len(); - while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap()) - { + match class { + /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */ + SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => { + *pos += grapheme.len(); + while ![CL, CP, EX, IS, SY] + .contains(&next_grapheme_class!(iter, grapheme).unwrap()) + { + *pos += grapheme.len(); + } *pos += grapheme.len(); + continue; } - *pos += grapheme.len(); - return self.next(); - } - OP => { - /* LB14 Do not break after ‘[’, even after spaces. - * OP SP* × - */ - while let Some((idx, grapheme)) = self.iter.next() { - *pos = idx + grapheme.len(); - if !(get_class!(grapheme) == SP) { - break; + OP => { + /* LB14 Do not break after ‘[’, even after spaces. 
+ * OP SP* × + */ + while let Some((idx, grapheme)) = self.iter.next() { + *pos = idx + grapheme.len(); + if !(get_class!(grapheme) == SP) { + break; + } } + continue; } - return self.next(); - } - QU if get_class!(text[idx..].trim_start()) == OP => { - /* LB15 Do not break within ‘”[’, even with intervening spaces. - * QU SP* × OP */ - *pos += grapheme.len(); - while Some(SP) == next_grapheme_class!(iter, grapheme) { + QU if get_class!(text[idx..].trim_start()) == OP => { + /* LB15 Do not break within ‘”[’, even with intervening spaces. + * QU SP* × OP */ *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + *pos = idx; + continue; } - *pos = idx; - return self.next(); - } - QU => { - /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */ - *pos += grapheme.len(); - if let Some((_, g)) = self.iter.next() { - *pos += g.len(); + QU => { + /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */ + *pos += grapheme.len(); + if let Some((_, g)) = self.iter.next() { + *pos += g.len(); + } + continue; } - return self.next(); - } - LineBreakClass::CL | LineBreakClass::CP - if get_class!(text[idx..].trim_start()) == NS => - { - /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with - * intervening spaces. - * (CL | CP) SP* × NS */ - *pos += grapheme.len(); - while Some(SP) == next_grapheme_class!(iter, grapheme) { + LineBreakClass::CL | LineBreakClass::CP + if get_class!(text[idx..].trim_start()) == NS => + { + /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with + * intervening spaces. 
+ * (CL | CP) SP* × NS */ *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + continue; } - return self.next(); - } - B2 if get_class!(text[idx..].trim_start()) == B2 => { - *pos += grapheme.len(); - while Some(SP) == next_grapheme_class!(iter, grapheme) { + B2 if get_class!(text[idx..].trim_start()) == B2 => { *pos += grapheme.len(); + while Some(SP) == next_grapheme_class!(iter, grapheme) { + *pos += grapheme.len(); + } + continue; } - return self.next(); - } - SP => { - /* LB18 Break after spaces. SP ÷ */ - // Space 0x20 is 1 byte long. - *pos += 1; - return Some((*pos, BreakAllowed)); - } - _ => {} - } - if let Some((next_idx, next_grapheme)) = next_char { - let next_class = get_class!(next_grapheme); - match next_class { - QU if class != SP => { - /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */ - *pos = *next_idx + next_grapheme.len(); - self.iter.next(); - return self.next(); + SP => { + /* LB18 Break after spaces. SP ÷ */ + // Space 0x20 is 1 byte long. + *pos += 1; + return Some((*pos, BreakAllowed)); } _ => {} } - } - match class { - CB => { - /* LB20 Break before and after unresolved CB. */ - *pos += grapheme.len(); - return Some((*pos - 1, BreakAllowed)); - } - /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small - * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */ - BB => { - *pos += grapheme.len(); - return self.next(); + if let Some((next_idx, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + QU if class != SP => { + /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. 
*/ + *pos = *next_idx + next_grapheme.len(); + self.iter.next(); + continue; + } + _ => {} + } } - _ => {} - } - - if let Some((_, next_grapheme)) = next_char { - let next_class = get_class!(next_grapheme); - match next_class { - BA | HY | NS => { - /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small - * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */ + match class { + CB => { + /* LB20 Break before and after unresolved CB. */ + *pos += grapheme.len(); + return Some((*pos - 1, BreakAllowed)); + } + /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small + * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */ + BB => { *pos += grapheme.len(); - return self.next(); + continue; } _ => {} } - } - match class { - HL if next_grapheme_class!((next_char is HY, BA)) => { - /* LB21a Don’t break after Hebrew + Hyphen. HL (HY | BA) × */ - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); + + if let Some((_, next_grapheme)) = next_char { + let next_class = get_class!(next_grapheme); + match next_class { + BA | HY | NS => { + /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small + * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */ + *pos += grapheme.len(); + continue; + } + _ => {} + } } - /* LB21b Don’t break between ,Solidus and Hebrew letters. SY × HL */ - SY if next_grapheme_class!((next_char is HL)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - /* bypass next_char */ - self.iter.next().unwrap(); - if let Some((idx, next_grapheme)) = self.iter.next() { + match class { + HL if next_grapheme_class!((next_char is HY, BA)) => { + /* LB21a Don’t break after Hebrew + Hyphen. 
HL (HY | BA) × */ + let (idx, next_grapheme) = next_char.unwrap(); *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; } - return self.next(); - } - /* LB22 Do not break between two ellipses, or between letters, numbers or excla- - * mations and ellipsis. - * Examples: ‘9...’, ‘a...’, ‘H...’ - * (AL | HL) × IN */ - AL | HL if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* EX × IN */ - EX if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - EX => { - // LB13 - *pos += grapheme.len(); - return self.next(); - } - /* (ID | EB | EM) × IN */ - ID | EB | EM if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* IN × IN */ - IN if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* NU × IN */ - NU if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB23 Do not break between digits and letters. - * (AL | HL) × NU */ - AL | HL if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* NU × (AL | HL) */ - NU if next_grapheme_class!((next_char is AL, HL)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB23a Do not break between numeric prefixes and ideographs, or between ideographs - * and numeric postfixes. 
- * PR × (ID | EB | EM) */ - PR if next_grapheme_class!((next_char is ID, EB, EM)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* (ID | EB | EM) × PO */ - ID | EB | EM if next_grapheme_class!((next_char is PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* B24 Do not break between numeric prefix/postfix and letters, or between - letters and prefix/postfix. - (PR | PO) × (AL | HL)*/ - PR | PO if next_grapheme_class!((next_char is AL, HL)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /*(AL | HL) × (PR | PO) */ - AL | HL if next_grapheme_class!((next_char is PR, PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB25 Do not break between the following pairs of classes relevant to numbers: - * CL × PO */ - CL if next_grapheme_class!((next_char is PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* CP × PO */ - CP if next_grapheme_class!((next_char is PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* CL × PR */ - CL if next_grapheme_class!((next_char is PR)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* CP × PR */ - CP if next_grapheme_class!((next_char is PR)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* NU × PO */ - NU if next_grapheme_class!((next_char is PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = 
idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* NU × PR */ - NU if next_grapheme_class!((next_char is PR)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* PO × OP */ - PO if next_grapheme_class!((next_char is OP)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* PO × NU */ - PO if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* PR × OP */ - PR if next_grapheme_class!((next_char is OP)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* PR × NU */ - PR if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* HY × NU */ - HY if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* IS × NU */ - IS if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* NU × NU */ - NU if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* SY × NU */ - SY if next_grapheme_class!((next_char is NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB26 Do not break a Korean syllable. 
- * JL × (JL | JV | H2 | H3) */ - JL if next_grapheme_class!((next_char is JL, JV, H2, H3)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* (JV | H2) × (JV | JT) */ - JV | H2 if next_grapheme_class!((next_char is JV, JT)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* (JT | H3) × JT */ - JT | H3 if next_grapheme_class!((next_char is JT)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB27 Treat a Korean Syllable Block the same as ID. - * (JL | JV | JT | H2 | H3) × IN */ - JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is IN)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* (JL | JV | JT | H2 | H3) × PO */ - JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* PR × (JL | JV | JT | H2 | H3) */ - PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB28 Do not break between alphabetics (“at”). - (AL | HL) × (AL | HL) */ - AL | HL if next_grapheme_class!((next_char is AL, HL)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB29 Do not break between numeric punctuation and alphabetics (“e.g.”). 
- IS × (AL | HL) */ - IS if next_grapheme_class!((next_char is AL, HL)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* LB30 Do not break between letters, numbers, or ordinary symbols and opening - or closing parentheses. - (AL | HL | NU) × OP */ - AL | HL | NU if next_grapheme_class!((next_char is OP)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /* CP × (AL | HL | NU) */ - CP if next_grapheme_class!((next_char is AL, HL , NU)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - /*LB30b Do not break between an emoji base and an emoji modifier. - * EB × EM */ - EB if next_grapheme_class!((next_char is EM)) => { - let (idx, next_grapheme) = next_char.unwrap(); - *pos = idx + next_grapheme.len(); - self.iter.next(); - return self.next(); - } - RI => { - /* LB30a Break between two regional indicator symbols if and only if there are an - * even number of regional indicators preceding the position of the break. - * sot (RI RI)* RI × RI - * [^RI] (RI RI)* RI × RI */ - *reg_ind_streak += 1; - *pos += grapheme.len(); - if *reg_ind_streak % 2 == 1 { + /* LB21b Don’t break between ,Solidus and Hebrew letters. SY × HL */ + SY if next_grapheme_class!((next_char is HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + /* bypass next_char */ + self.iter.next().unwrap(); + if let Some((idx, next_grapheme)) = self.iter.next() { + *pos = idx + next_grapheme.len(); + } + continue; + } + /* LB22 Do not break between two ellipses, or between letters, numbers or excla- + * mations and ellipsis. 
+ * Examples: ‘9...’, ‘a...’, ‘H...’ + * (AL | HL) × IN */ + AL | HL if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* EX × IN */ + EX if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + EX => { + // LB13 + *pos += grapheme.len(); + continue; + } + /* (ID | EB | EM) × IN */ + ID | EB | EM if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* IN × IN */ + IN if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* NU × IN */ + NU if next_grapheme_class!((next_char is IN)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB23 Do not break between digits and letters. + * (AL | HL) × NU */ + AL | HL if next_grapheme_class!((next_char is NU)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* NU × (AL | HL) */ + NU if next_grapheme_class!((next_char is AL, HL)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* LB23a Do not break between numeric prefixes and ideographs, or between ideographs + * and numeric postfixes. 
+ * PR × (ID | EB | EM) */ + PR if next_grapheme_class!((next_char is ID, EB, EM)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* (ID | EB | EM) × PO */ + ID | EB | EM if next_grapheme_class!((next_char is PO)) => { + let (idx, next_grapheme) = next_char.unwrap(); + *pos = idx + next_grapheme.len(); + self.iter.next(); + continue; + } + /* B24 Do not break between numeric prefix/postfix and letters, or between + letters and prefix/postfix. + (PR | PO) × (AL | |