summary | refs | log | tree | commit | diff | stats
path: root/melib/src/text_processing
diff options
context:
space:
mode:
author Manos Pitsidianakis <el13635@mail.ntua.gr> 2020-05-31 01:08:22 +0300
committer Manos Pitsidianakis <el13635@mail.ntua.gr> 2020-05-31 01:08:22 +0300
commit b3b9563db0414e4a483f158585dac9d3e7b0a91c (patch)
tree eab08a1ad7b4b68e63d757948131902deb28d318 /melib/src/text_processing
parent 6ceed3cae9a13764262913b4d9a1b8302b213551 (diff)
LineBreakCandidateIter: make iter non-recursive
A line with lots of graphemes without any breaks can overflow the stack, so make the recursion into a loop.
Diffstat (limited to 'melib/src/text_processing')
-rw-r--r-- melib/src/text_processing/line_break.rs 1035
1 files changed, 519 insertions, 516 deletions
diff --git a/melib/src/text_processing/line_break.rs b/melib/src/text_processing/line_break.rs
index 7ba171c1..8e00e47e 100644
--- a/melib/src/text_processing/line_break.rs
+++ b/melib/src/text_processing/line_break.rs
@@ -130,573 +130,576 @@ macro_rules! next_grapheme_class {
impl<'a> Iterator for LineBreakCandidateIter<'a> {
type Item = (usize, LineBreakCandidate);
fn next(&mut self) -> Option<Self::Item> {
- // After end of text, there are no breaks.
- if self.pos >= self.text.len() {
- return None;
- }
- // LB3 Always break at the end of text
- if self.pos + 1 == self.text.len() {
- self.pos += 1;
- return Some((self.pos, MandatoryBreak));
- }
+ loop {
+ // After end of text, there are no breaks.
+ if self.pos >= self.text.len() {
+ return None;
+ }
+ // LB3 Always break at the end of text
+ if self.pos + 1 == self.text.len() {
+ self.pos += 1;
+ return Some((self.pos, MandatoryBreak));
+ }
- let (idx, mut grapheme) = self.iter.next().unwrap();
- let LineBreakCandidateIter {
- ref mut iter,
- ref text,
- ref mut reg_ind_streak,
- ref mut pos,
- } = self;
- let iter = iter.by_ref();
+ let LineBreakCandidateIter {
+ ref mut iter,
+ ref text,
+ ref mut reg_ind_streak,
+ ref mut pos,
+ } = self;
+ let (idx, mut grapheme) = iter.next().unwrap();
+ let iter = iter.by_ref();
- debug_assert_eq!(idx, *pos);
+ debug_assert_eq!(idx, *pos);
- // LB2 Never break at the start of text
- if idx == 0 {
- *pos += grapheme.len();
- return self.next();
- }
+ // LB2 Never break at the start of text
+ if idx == 0 {
+ *pos += grapheme.len();
+ continue;
+ }
- let class = get_class!(grapheme);
+ let class = get_class!(grapheme);
- if class != RI {
- *reg_ind_streak = 0;
- }
+ if class != RI {
+ *reg_ind_streak = 0;
+ }
- /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ,
- * SA, SG, and XX into other line breaking classes depending on criteria outside the scope
- * of this algorithm.
- *
- * In the absence of such criteria all characters with a specific combination of original
- * class and General_Category property value are resolved as follows:
- * Resolved Original General_Category
- * AL AI, SG, XX Any
- * CM SA Only Mn or Mc
- * AL SA Any except Mn and Mc
- * NS SJ Any
- */
+ /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ,
+ * SA, SG, and XX into other line breaking classes depending on criteria outside the scope
+ * of this algorithm.
+ *
+ * In the absence of such criteria all characters with a specific combination of original
+ * class and General_Category property value are resolved as follows:
+ * Resolved Original General_Category
+ * AL AI, SG, XX Any
+ * CM SA Only Mn or Mc
+ * AL SA Any except Mn and Mc
+ * NS SJ Any
+ */
- // TODO: LB1
+ // TODO: LB1
- /* Check if next character class allows breaks before it */
- let next_char: Option<&(usize, &str)> = iter.peek();
+ /* Check if next character class allows breaks before it */
+ let next_char: Option<&(usize, &str)> = iter.peek();
- match class {
- BK => {
- // LB4 Always Break after hard line breaks.
- *pos += grapheme.len();
- return Some((*pos, MandatoryBreak));
- }
- // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
- CR if next_grapheme_class!((next_char is LF)) => {
- *pos += grapheme.len();
- assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
- *pos += grapheme.len();
- return Some((*pos, MandatoryBreak));
- }
- CR | LF | NL => {
- *pos += grapheme.len();
- return Some((*pos, MandatoryBreak));
- }
- _ => {}
- }
- if let Some((_, next_grapheme)) = next_char {
- let next_class = get_class!(next_grapheme);
- match next_class {
- /* LB6 Do not break before hard line breaks. × ( BK | CR | LF | NL ) */
- BK | CR | LF | NL => {
+ match class {
+ BK => {
+ // LB4 Always Break after hard line breaks.
*pos += grapheme.len();
- return self.next();
+ return Some((*pos, MandatoryBreak));
}
- /* LB7 Do not break before spaces or zero width
- * space. × SP × ZW */
- SP | ZW => {
+ // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
+ CR if next_grapheme_class!((next_char is LF)) => {
+ *pos += grapheme.len();
+ assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
*pos += grapheme.len();
- return self.next();
+ return Some((*pos, MandatoryBreak));
}
- _ => {}
- }
- }
- match class {
- ZW => {
- // LB8 Break before any character following a zero-width space, even if one or more
- // spaces intervene
- // ZW SP* ÷
- *pos += grapheme.len();
- while Some(SP) == next_grapheme_class!(iter, grapheme) {
+ CR | LF | NL => {
*pos += grapheme.len();
+ return Some((*pos, MandatoryBreak));
}
- return Some((*pos, MandatoryBreak));
+ _ => {}
}
- ZWJ => {
- // LB8a Do not break after a zero width joiner.
- *pos += grapheme.len();
- return self.next();
+ if let Some((_, next_grapheme)) = next_char {
+ let next_class = get_class!(next_grapheme);
+ match next_class {
+ /* LB6 Do not break before hard line breaks. × ( BK | CR | LF | NL ) */
+ BK | CR | LF | NL => {
+ *pos += grapheme.len();
+ continue;
+ }
+ /* LB7 Do not break before spaces or zero width
+ * space. × SP × ZW */
+ SP | ZW => {
+ *pos += grapheme.len();
+ continue;
+ }
+ _ => {}
+ }
}
+ match class {
+ ZW => {
+ // LB8 Break before any character following a zero-width space, even if one or more
+ // spaces intervene
+ // ZW SP* ÷
+ *pos += grapheme.len();
+ while Some(SP) == next_grapheme_class!(iter, grapheme) {
+ *pos += grapheme.len();
+ }
+ return Some((*pos, MandatoryBreak));
+ }
+ ZWJ => {
+ // LB8a Do not break after a zero width joiner.
+ *pos += grapheme.len();
+ continue;
+ }
- CM => {
- // LB9 Do not break a combining character sequence; treat it as if it has the line
- // breaking class of the base character in all of the following rules. Treat ZWJ as
- // if it were CM.
- // Treat X (CM | ZWJ)* as if it were X.
- // where X is any line break class except BK, CR, LF, NL, SP, or ZW.
+ CM => {
+ // LB9 Do not break a combining character sequence; treat it as if it has the line
+ // breaking class of the base character in all of the following rules. Treat ZWJ as
+ // if it were CM.
+ // Treat X (CM | ZWJ)* as if it were X.
+ // where X is any line break class except BK, CR, LF, NL, SP, or ZW.
- *pos += grapheme.len();
- return self.next();
- }
- WJ => {
- /*: LB11 Do not break before or after Word joiner and related characters.*/
- *pos += grapheme.len();
- /* Get next grapheme */
- if next_grapheme_class!(iter, grapheme).is_some() {
*pos += grapheme.len();
+ continue;
}
- return self.next();
- }
- GL => {
- /*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/
- *pos += grapheme.len();
- return self.next();
- }
- _ => {}
- }
- if let Some((next_idx, next_grapheme)) = next_char {
- let next_class = get_class!(next_grapheme);
- match next_class {
- GL if ![SP, BA, HY].contains(&class) => {
- /* LB12a Do not break before NBSP and related characters, except after spaces and
- * hyphens. [^SP BA HY] × GL
- * Also LB12 Do not break after NBSP and related characters */
+ WJ => {
+ /*: LB11 Do not break before or after Word joiner and related characters.*/
*pos += grapheme.len();
- return self.next();
+ /* Get next grapheme */
+ if next_grapheme_class!(iter, grapheme).is_some() {
+ *pos += grapheme.len();
+ }
+ continue;
}
- /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
- CL | CP | EX | IS | SY => {
- *pos = *next_idx;
- return self.next();
+ GL => {
+ /*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/
+ *pos += grapheme.len();
+ continue;
}
_ => {}
}
- }
+ if let Some((next_idx, next_grapheme)) = next_char {
+ let next_class = get_class!(next_grapheme);
+ match next_class {
+ GL if ![SP, BA, HY].contains(&class) => {
+ /* LB12a Do not break before NBSP and related characters, except after spaces and
+ * hyphens. [^SP BA HY] × GL
+ * Also LB12 Do not break after NBSP and related characters */
+ *pos += grapheme.len();
+ continue;
+ }
+ /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
+ CL | CP | EX | IS | SY => {
+ *pos = *next_idx;
+ continue;
+ }
+ _ => {}
+ }
+ }
- match class {
- /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
- SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => {
- *pos += grapheme.len();
- while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap())
- {
+ match class {
+ /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
+ SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => {
+ *pos += grapheme.len();
+ while ![CL, CP, EX, IS, SY]
+ .contains(&next_grapheme_class!(iter, grapheme).unwrap())
+ {
+ *pos += grapheme.len();
+ }
*pos += grapheme.len();
+ continue;
}
- *pos += grapheme.len();
- return self.next();
- }
- OP => {
- /* LB14 Do not break after ‘[’, even after spaces.
- * OP SP* ×
- */
- while let Some((idx, grapheme)) = self.iter.next() {
- *pos = idx + grapheme.len();
- if !(get_class!(grapheme) == SP) {
- break;
+ OP => {
+ /* LB14 Do not break after ‘[’, even after spaces.
+ * OP SP* ×
+ */
+ while let Some((idx, grapheme)) = self.iter.next() {
+ *pos = idx + grapheme.len();
+ if !(get_class!(grapheme) == SP) {
+ break;
+ }
}
+ continue;
}
- return self.next();
- }
- QU if get_class!(text[idx..].trim_start()) == OP => {
- /* LB15 Do not break within ‘”[’, even with intervening spaces.
- * QU SP* × OP */
- *pos += grapheme.len();
- while Some(SP) == next_grapheme_class!(iter, grapheme) {
+ QU if get_class!(text[idx..].trim_start()) == OP => {
+ /* LB15 Do not break within ‘”[’, even with intervening spaces.
+ * QU SP* × OP */
*pos += grapheme.len();
+ while Some(SP) == next_grapheme_class!(iter, grapheme) {
+ *pos += grapheme.len();
+ }
+ *pos = idx;
+ continue;
}
- *pos = idx;
- return self.next();
- }
- QU => {
- /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
- *pos += grapheme.len();
- if let Some((_, g)) = self.iter.next() {
- *pos += g.len();
+ QU => {
+ /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
+ *pos += grapheme.len();
+ if let Some((_, g)) = self.iter.next() {
+ *pos += g.len();
+ }
+ continue;
}
- return self.next();
- }
- LineBreakClass::CL | LineBreakClass::CP
- if get_class!(text[idx..].trim_start()) == NS =>
- {
- /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
- * intervening spaces.
- * (CL | CP) SP* × NS */
- *pos += grapheme.len();
- while Some(SP) == next_grapheme_class!(iter, grapheme) {
+ LineBreakClass::CL | LineBreakClass::CP
+ if get_class!(text[idx..].trim_start()) == NS =>
+ {
+ /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
+ * intervening spaces.
+ * (CL | CP) SP* × NS */
*pos += grapheme.len();
+ while Some(SP) == next_grapheme_class!(iter, grapheme) {
+ *pos += grapheme.len();
+ }
+ continue;
}
- return self.next();
- }
- B2 if get_class!(text[idx..].trim_start()) == B2 => {
- *pos += grapheme.len();
- while Some(SP) == next_grapheme_class!(iter, grapheme) {
+ B2 if get_class!(text[idx..].trim_start()) == B2 => {
*pos += grapheme.len();
+ while Some(SP) == next_grapheme_class!(iter, grapheme) {
+ *pos += grapheme.len();
+ }
+ continue;
}
- return self.next();
- }
- SP => {
- /* LB18 Break after spaces. SP ÷ */
- // Space 0x20 is 1 byte long.
- *pos += 1;
- return Some((*pos, BreakAllowed));
- }
- _ => {}
- }
- if let Some((next_idx, next_grapheme)) = next_char {
- let next_class = get_class!(next_grapheme);
- match next_class {
- QU if class != SP => {
- /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
- *pos = *next_idx + next_grapheme.len();
- self.iter.next();
- return self.next();
+ SP => {
+ /* LB18 Break after spaces. SP ÷ */
+ // Space 0x20 is 1 byte long.
+ *pos += 1;
+ return Some((*pos, BreakAllowed));
}
_ => {}
}
- }
- match class {
- CB => {
- /* LB20 Break before and after unresolved CB. */
- *pos += grapheme.len();
- return Some((*pos - 1, BreakAllowed));
- }
- /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
- * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
- BB => {
- *pos += grapheme.len();
- return self.next();
+ if let Some((next_idx, next_grapheme)) = next_char {
+ let next_class = get_class!(next_grapheme);
+ match next_class {
+ QU if class != SP => {
+ /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
+ *pos = *next_idx + next_grapheme.len();
+ self.iter.next();
+ continue;
+ }
+ _ => {}
+ }
}
- _ => {}
- }
-
- if let Some((_, next_grapheme)) = next_char {
- let next_class = get_class!(next_grapheme);
- match next_class {
- BA | HY | NS => {
- /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
- * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
+ match class {
+ CB => {
+ /* LB20 Break before and after unresolved CB. */
+ *pos += grapheme.len();
+ return Some((*pos - 1, BreakAllowed));
+ }
+ /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
+ * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
+ BB => {
*pos += grapheme.len();
- return self.next();
+ continue;
}
_ => {}
}
- }
- match class {
- HL if next_grapheme_class!((next_char is HY, BA)) => {
- /* LB21a Don’t break after Hebrew + Hyphen. HL (HY | BA) × */
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
+
+ if let Some((_, next_grapheme)) = next_char {
+ let next_class = get_class!(next_grapheme);
+ match next_class {
+ BA | HY | NS => {
+ /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
+ * kana, and other non-starters, or after acute accents. × BA, × HY, × NS, BB × */
+ *pos += grapheme.len();
+ continue;
+ }
+ _ => {}
+ }
}
- /* LB21b Don’t break between ,Solidus and Hebrew letters. SY × HL */
- SY if next_grapheme_class!((next_char is HL)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- /* bypass next_char */
- self.iter.next().unwrap();
- if let Some((idx, next_grapheme)) = self.iter.next() {
+ match class {
+ HL if next_grapheme_class!((next_char is HY, BA)) => {
+ /* LB21a Don’t break after Hebrew + Hyphen. HL (HY | BA) × */
+ let (idx, next_grapheme) = next_char.unwrap();
*pos = idx + next_grapheme.len();
+ self.iter.next();
+ continue;
}
- return self.next();
- }
- /* LB22 Do not break between two ellipses, or between letters, numbers or excla-
- * mations and ellipsis.
- * Examples: ‘9...’, ‘a...’, ‘H...’
- * (AL | HL) × IN */
- AL | HL if next_grapheme_class!((next_char is IN)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* EX × IN */
- EX if next_grapheme_class!((next_char is IN)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- EX => {
- // LB13
- *pos += grapheme.len();
- return self.next();
- }
- /* (ID | EB | EM) × IN */
- ID | EB | EM if next_grapheme_class!((next_char is IN)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* IN × IN */
- IN if next_grapheme_class!((next_char is IN)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* NU × IN */
- NU if next_grapheme_class!((next_char is IN)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* LB23 Do not break between digits and letters.
- * (AL | HL) × NU */
- AL | HL if next_grapheme_class!((next_char is NU)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* NU × (AL | HL) */
- NU if next_grapheme_class!((next_char is AL, HL)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* LB23a Do not break between numeric prefixes and ideographs, or between ideographs
- * and numeric postfixes.
- * PR × (ID | EB | EM) */
- PR if next_grapheme_class!((next_char is ID, EB, EM)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* (ID | EB | EM) × PO */
- ID | EB | EM if next_grapheme_class!((next_char is PO)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* B24 Do not break between numeric prefix/postfix and letters, or between
- letters and prefix/postfix.
- (PR | PO) × (AL | HL)*/
- PR | PO if next_grapheme_class!((next_char is AL, HL)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /*(AL | HL) × (PR | PO) */
- AL | HL if next_grapheme_class!((next_char is PR, PO)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* LB25 Do not break between the following pairs of classes relevant to numbers:
- * CL × PO */
- CL if next_grapheme_class!((next_char is PO)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* CP × PO */
- CP if next_grapheme_class!((next_char is PO)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* CL × PR */
- CL if next_grapheme_class!((next_char is PR)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* CP × PR */
- CP if next_grapheme_class!((next_char is PR)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* NU × PO */
- NU if next_grapheme_class!((next_char is PO)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* NU × PR */
- NU if next_grapheme_class!((next_char is PR)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* PO × OP */
- PO if next_grapheme_class!((next_char is OP)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* PO × NU */
- PO if next_grapheme_class!((next_char is NU)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* PR × OP */
- PR if next_grapheme_class!((next_char is OP)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* PR × NU */
- PR if next_grapheme_class!((next_char is NU)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* HY × NU */
- HY if next_grapheme_class!((next_char is NU)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* IS × NU */
- IS if next_grapheme_class!((next_char is NU)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* NU × NU */
- NU if next_grapheme_class!((next_char is NU)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* SY × NU */
- SY if next_grapheme_class!((next_char is NU)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* LB26 Do not break a Korean syllable.
- * JL × (JL | JV | H2 | H3) */
- JL if next_grapheme_class!((next_char is JL, JV, H2, H3)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* (JV | H2) × (JV | JT) */
- JV | H2 if next_grapheme_class!((next_char is JV, JT)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* (JT | H3) × JT */
- JT | H3 if next_grapheme_class!((next_char is JT)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* LB27 Treat a Korean Syllable Block the same as ID.
- * (JL | JV | JT | H2 | H3) × IN */
- JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is IN)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* (JL | JV | JT | H2 | H3) × PO */
- JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* PR × (JL | JV | JT | H2 | H3) */
- PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* LB28 Do not break between alphabetics (“at”).
- (AL | HL) × (AL | HL) */
- AL | HL if next_grapheme_class!((next_char is AL, HL)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
- IS × (AL | HL) */
- IS if next_grapheme_class!((next_char is AL, HL)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* LB30 Do not break between letters, numbers, or ordinary symbols and opening
- or closing parentheses.
- (AL | HL | NU) × OP */
- AL | HL | NU if next_grapheme_class!((next_char is OP)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /* CP × (AL | HL | NU) */
- CP if next_grapheme_class!((next_char is AL, HL , NU)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- /*LB30b Do not break between an emoji base and an emoji modifier.
- * EB × EM */
- EB if next_grapheme_class!((next_char is EM)) => {
- let (idx, next_grapheme) = next_char.unwrap();
- *pos = idx + next_grapheme.len();
- self.iter.next();
- return self.next();
- }
- RI => {
- /* LB30a Break between two regional indicator symbols if and only if there are an
- * even number of regional indicators preceding the position of the break.
- * sot (RI RI)* RI × RI
- * [^RI] (RI RI)* RI × RI */
- *reg_ind_streak += 1;
- *pos += grapheme.len();
- if *reg_ind_streak % 2 == 1 {
+ /* LB21b Don’t break between ,Solidus and Hebrew letters. SY × HL */
+ SY if next_grapheme_class!((next_char is HL)) => {
+ let (idx, next_grapheme) = next_char.unwrap();
+ *pos = idx + next_grapheme.len();
+ /* bypass next_char */
+ self.iter.next().unwrap();
+ if let Some((idx, next_grapheme)) = self.iter.next() {
+ *pos = idx + next_grapheme.len();
+ }
+ continue;
+ }
+ /* LB22 Do not break between two ellipses, or between letters, numbers or excla-
+ * mations and ellipsis.
+ * Examples: ‘9...’, ‘a...’, ‘H...’
+ * (AL | HL) × IN */
+ AL | HL if next_grapheme_class!((next_char is IN)) => {
+ let (idx, next_grapheme) = next_char.unwrap();
+ *pos = idx + next_grapheme.len();
+ self.iter.next();
+ continue;
+ }
+ /* EX × IN */
+ EX if next_grapheme_class!((next_char is IN)) => {
+ let (idx, next_grapheme) = next_char.unwrap();
+ *pos = idx + next_grapheme.len();
+ self.iter.next();
+ continue;
+ }
+ EX => {
+ // LB13
+ *pos += grapheme.len();
+ continue;
+ }
+ /* (ID | EB | EM) × IN */
+ ID | EB | EM if next_grapheme_class!((next_char is IN)) => {
+ let (idx, next_grapheme) = next_char.unwrap();
+ *pos = idx + next_grapheme.len();
+ self.iter.next();
+ continue;
+ }
+ /* IN × IN */
+ IN if next_grapheme_class!((next_char is IN)) => {
+ let (idx, next_grapheme) = next_char.unwrap();
+ *pos = idx + next_grapheme.len();
+ self.iter.next();
+ continue;
+ }
+ /* NU × IN */
+ NU if next_grapheme_class!((next_char is IN)) => {
+ let (idx, next_grapheme) = next_char.unwrap();
+ *pos = idx + next_grapheme.len();
+ self.iter.next();
+ continue;
+ }
+ /* LB23 Do not break between digits and letters.
+ * (AL | HL) × NU */
+ AL | HL if next_grapheme_class!((next_char is NU)) => {
+ let (idx, next_grapheme) = next_char.unwrap();
+ *pos = idx + next_grapheme.len();
+ self.iter.next();
+ continue;
+ }
+ /* NU × (AL | HL) */
+ NU if next_grapheme_class!((next_char is AL, HL)) => {
+ let (idx, next_grapheme) = next_char.unwrap();
+ *pos = idx + next_grapheme.len();
+ self.iter.next();
+ continue;
+ }
+ /* LB23a Do not break between numeric prefixes and ideographs, or between ideographs
+ * and numeric postfixes.
+ * PR × (ID | EB | EM) */
+ PR if next_grapheme_class!((next_char is ID, EB, EM)) => {
+ let (idx, next_grapheme) = next_char.unwrap();
+ *pos = idx + next_grapheme.len();
+ self.iter.next();
+ continue;
+ }
+ /* (ID | EB | EM) × PO */
+ ID | EB | EM if next_grapheme_class!((next_char is PO)) => {
+ let (idx, next_grapheme) = next_char.unwrap();
+ *pos = idx + next_grapheme.len();
+ self.iter.next();
+ continue;
+ }
+ /* B24 Do not break between numeric prefix/postfix and letters, or between
+ letters and prefix/postfix.
+ (PR | PO) × (AL |