LineBreakCandidateIter: make iter non-recursive

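A line with many graphemes and no break opportunities can overflow the stack, because next() called itself recursively for every grapheme that did not yield a break candidate; turn that recursion into a loop.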
author: Manos Pitsidianakis <el13635@mail.ntua.gr> 2020-05-31 01:08:22 +0300
committer: Manos Pitsidianakis <el13635@mail.ntua.gr> 2020-05-31 01:08:22 +0300
commit: b3b9563db0414e4a483f158585dac9d3e7b0a91c (patch)
tree: eab08a1ad7b4b68e63d757948131902deb28d318 /melib/src/text_processing
parent: 6ceed3cae9a13764262913b4d9a1b8302b213551 (diff)
1 files changed, 519 insertions, 516 deletions
diff --git a/melib/src/text_processing/line_break.rs b/melib/src/text_processing/line_break.rs
index 7ba171c1..8e00e47e 100644
--- a/melib/src/text_processing/line_break.rs
+++ b/melib/src/text_processing/line_break.rs
@@ -130,573 +130,576 @@ macro_rules! next_grapheme_class {
 impl<'a> Iterator for LineBreakCandidateIter<'a> {
     type Item = (usize, LineBreakCandidate);
     fn next(&mut self) -> Option<Self::Item> {
-        // After end of text, there are no breaks.
-        if self.pos >= self.text.len() {
-            return None;
-        }
-        // LB3 Always break at the end of text
-        if self.pos + 1 == self.text.len() {
-            self.pos += 1;
-            return Some((self.pos, MandatoryBreak));
-        }
+        loop {
+            // After end of text, there are no breaks.
+            if self.pos >= self.text.len() {
+                return None;
+            }
+            // LB3 Always break at the end of text
+            if self.pos + 1 == self.text.len() {
+                self.pos += 1;
+                return Some((self.pos, MandatoryBreak));
+            }
 
-        let (idx, mut grapheme) = self.iter.next().unwrap();
-        let LineBreakCandidateIter {
-            ref mut iter,
-            ref text,
-            ref mut reg_ind_streak,
-            ref mut pos,
-        } = self;
-        let iter = iter.by_ref();
+            let LineBreakCandidateIter {
+                ref mut iter,
+                ref text,
+                ref mut reg_ind_streak,
+                ref mut pos,
+            } = self;
+            let (idx, mut grapheme) = iter.next().unwrap();
+            let iter = iter.by_ref();
 
-        debug_assert_eq!(idx, *pos);
+            debug_assert_eq!(idx, *pos);
 
-        // LB2 Never break at the start of text
-        if idx == 0 {
-            *pos += grapheme.len();
-            return self.next();
-        }
+            // LB2 Never break at the start of text
+            if idx == 0 {
+                *pos += grapheme.len();
+                continue;
+            }
 
-        let class = get_class!(grapheme);
+            let class = get_class!(grapheme);
 
-        if class != RI {
-            *reg_ind_streak = 0;
-        }
+            if class != RI {
+                *reg_ind_streak = 0;
+            }
 
-        /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ,
-         * SA, SG, and XX into other line breaking classes depending on criteria outside the scope
-         * of this algorithm.
-         *
-         * In the absence of such criteria all characters with a specific combination of original
-         * class and General_Category property value are resolved as follows:
-         * Resolved Original     General_Category
-         * AL       AI, SG, XX   Any
-         * CM       SA           Only Mn or Mc
-         * AL       SA           Any except Mn and Mc
-         * NS       SJ           Any
-         */
+            /* LB1 Assign a line breaking class to each code point of the input. Resolve AI, CB, CJ,
+             * SA, SG, and XX into other line breaking classes depending on criteria outside the scope
+             * of this algorithm.
+             *
+             * In the absence of such criteria all characters with a specific combination of original
+             * class and General_Category property value are resolved as follows:
+             * Resolved Original     General_Category
+             * AL       AI, SG, XX   Any
+             * CM       SA           Only Mn or Mc
+             * AL       SA           Any except Mn and Mc
+             * NS       SJ           Any
+             */
 
-        // TODO: LB1
+            // TODO: LB1
 
-        /* Check if next character class allows breaks before it */
-        let next_char: Option<&(usize, &str)> = iter.peek();
+            /* Check if next character class allows breaks before it */
+            let next_char: Option<&(usize, &str)> = iter.peek();
 
-        match class {
-            BK => {
-                // LB4 Always Break after hard line breaks.
-                *pos += grapheme.len();
-                return Some((*pos, MandatoryBreak));
-            }
-            // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
-            CR if next_grapheme_class!((next_char is LF)) => {
-                *pos += grapheme.len();
-                assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
-                *pos += grapheme.len();
-                return Some((*pos, MandatoryBreak));
-            }
-            CR | LF | NL => {
-                *pos += grapheme.len();
-                return Some((*pos, MandatoryBreak));
-            }
-            _ => {}
-        }
-        if let Some((_, next_grapheme)) = next_char {
-            let next_class = get_class!(next_grapheme);
-            match next_class {
-                /* LB6 Do not break before hard line breaks.  × ( BK | CR | LF | NL ) */
-                BK | CR | LF | NL => {
+            match class {
+                BK => {
+                    // LB4 Always Break after hard line breaks.
                     *pos += grapheme.len();
-                    return self.next();
+                    return Some((*pos, MandatoryBreak));
                 }
-                /* LB7 Do not break before spaces or zero width
-                 * space. × SP × ZW */
-                SP | ZW => {
+                // LB5 Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks
+                CR if next_grapheme_class!((next_char is LF)) => {
+                    *pos += grapheme.len();
+                    assert!(Some(LF) == next_grapheme_class!(iter, grapheme));
                     *pos += grapheme.len();
-                    return self.next();
+                    return Some((*pos, MandatoryBreak));
                 }
-                _ => {}
-            }
-        }
-        match class {
-            ZW => {
-                // LB8 Break before any character following a zero-width space, even if one or more
-                // spaces intervene
-                // ZW SP* ÷
-                *pos += grapheme.len();
-                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                CR | LF | NL => {
                     *pos += grapheme.len();
+                    return Some((*pos, MandatoryBreak));
                 }
-                return Some((*pos, MandatoryBreak));
+                _ => {}
             }
-            ZWJ => {
-                // LB8a Do not break after a zero width joiner.
-                *pos += grapheme.len();
-                return self.next();
+            if let Some((_, next_grapheme)) = next_char {
+                let next_class = get_class!(next_grapheme);
+                match next_class {
+                    /* LB6 Do not break before hard line breaks.  × ( BK | CR | LF | NL ) */
+                    BK | CR | LF | NL => {
+                        *pos += grapheme.len();
+                        continue;
+                    }
+                    /* LB7 Do not break before spaces or zero width
+                     * space. × SP × ZW */
+                    SP | ZW => {
+                        *pos += grapheme.len();
+                        continue;
+                    }
+                    _ => {}
+                }
             }
+            match class {
+                ZW => {
+                    // LB8 Break before any character following a zero-width space, even if one or more
+                    // spaces intervene
+                    // ZW SP* ÷
+                    *pos += grapheme.len();
+                    while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                        *pos += grapheme.len();
+                    }
+                    return Some((*pos, MandatoryBreak));
+                }
+                ZWJ => {
+                    // LB8a Do not break after a zero width joiner.
+                    *pos += grapheme.len();
+                    continue;
+                }
 
-            CM => {
-                // LB9 Do not break a combining character sequence; treat it as if it has the line
-                // breaking class of the base character in all of the following rules. Treat ZWJ as
-                // if it were CM.
-                // Treat X (CM | ZWJ)* as if it were X.
-                // where X is any line break class except BK, CR, LF, NL, SP, or ZW.
+                CM => {
+                    // LB9 Do not break a combining character sequence; treat it as if it has the line
+                    // breaking class of the base character in all of the following rules. Treat ZWJ as
+                    // if it were CM.
+                    // Treat X (CM | ZWJ)* as if it were X.
+                    // where X is any line break class except BK, CR, LF, NL, SP, or ZW.
 
-                *pos += grapheme.len();
-                return self.next();
-            }
-            WJ => {
-                /*: LB11 Do not break before or after Word joiner and related characters.*/
-                *pos += grapheme.len();
-                /* Get next grapheme */
-                if next_grapheme_class!(iter, grapheme).is_some() {
                     *pos += grapheme.len();
+                    continue;
                 }
-                return self.next();
-            }
-            GL => {
-                /*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/
-                *pos += grapheme.len();
-                return self.next();
-            }
-            _ => {}
-        }
-        if let Some((next_idx, next_grapheme)) = next_char {
-            let next_class = get_class!(next_grapheme);
-            match next_class {
-                GL if ![SP, BA, HY].contains(&class) => {
-                    /* LB12a Do not break before NBSP and related characters, except after spaces and
-                     * hyphens.  [^SP BA HY] × GL
-                     * Also LB12 Do not break after NBSP and related characters */
+                WJ => {
+                    /*: LB11 Do not break before or after Word joiner and related characters.*/
                     *pos += grapheme.len();
-                    return self.next();
+                    /* Get next grapheme */
+                    if next_grapheme_class!(iter, grapheme).is_some() {
+                        *pos += grapheme.len();
+                    }
+                    continue;
                 }
-                /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
-                CL | CP | EX | IS | SY => {
-                    *pos = *next_idx;
-                    return self.next();
+                GL => {
+                    /*LB12 Non-breaking characters: LB12 Do not break after NBSP and related characters.*/
+                    *pos += grapheme.len();
+                    continue;
                 }
                 _ => {}
             }
-        }
+            if let Some((next_idx, next_grapheme)) = next_char {
+                let next_class = get_class!(next_grapheme);
+                match next_class {
+                    GL if ![SP, BA, HY].contains(&class) => {
+                        /* LB12a Do not break before NBSP and related characters, except after spaces and
+                         * hyphens.  [^SP BA HY] × GL
+                         * Also LB12 Do not break after NBSP and related characters */
+                        *pos += grapheme.len();
+                        continue;
+                    }
+                    /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
+                    CL | CP | EX | IS | SY => {
+                        *pos = *next_idx;
+                        continue;
+                    }
+                    _ => {}
+                }
+            }
 
-        match class {
-            /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
-            SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => {
-                *pos += grapheme.len();
-                while ![CL, CP, EX, IS, SY].contains(&next_grapheme_class!(iter, grapheme).unwrap())
-                {
+            match class {
+                /* LB13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. */
+                SP if [CL, CP, EX, IS, SY].contains(&get_class!(text[idx..].trim_start())) => {
+                    *pos += grapheme.len();
+                    while ![CL, CP, EX, IS, SY]
+                        .contains(&next_grapheme_class!(iter, grapheme).unwrap())
+                    {
+                        *pos += grapheme.len();
+                    }
                     *pos += grapheme.len();
+                    continue;
                 }
-                *pos += grapheme.len();
-                return self.next();
-            }
-            OP => {
-                /* LB14 Do not break after ‘[’, even after spaces.
-                 * OP SP* ×
-                 */
-                while let Some((idx, grapheme)) = self.iter.next() {
-                    *pos = idx + grapheme.len();
-                    if !(get_class!(grapheme) == SP) {
-                        break;
+                OP => {
+                    /* LB14 Do not break after ‘[’, even after spaces.
+                     * OP SP* ×
+                     */
+                    while let Some((idx, grapheme)) = self.iter.next() {
+                        *pos = idx + grapheme.len();
+                        if !(get_class!(grapheme) == SP) {
+                            break;
+                        }
                     }
+                    continue;
                 }
-                return self.next();
-            }
-            QU if get_class!(text[idx..].trim_start()) == OP => {
-                /* LB15 Do not break within ‘”[’, even with intervening spaces.
-                 * QU SP* × OP */
-                *pos += grapheme.len();
-                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                QU if get_class!(text[idx..].trim_start()) == OP => {
+                    /* LB15 Do not break within ‘”[’, even with intervening spaces.
+                     * QU SP* × OP */
                     *pos += grapheme.len();
+                    while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                        *pos += grapheme.len();
+                    }
+                    *pos = idx;
+                    continue;
                 }
-                *pos = idx;
-                return self.next();
-            }
-            QU => {
-                /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
-                *pos += grapheme.len();
-                if let Some((_, g)) = self.iter.next() {
-                    *pos += g.len();
+                QU => {
+                    /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
+                    *pos += grapheme.len();
+                    if let Some((_, g)) = self.iter.next() {
+                        *pos += g.len();
+                    }
+                    continue;
                 }
-                return self.next();
-            }
-            LineBreakClass::CL | LineBreakClass::CP
-                if get_class!(text[idx..].trim_start()) == NS =>
-            {
-                /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
-                 * intervening spaces.
-                 * (CL | CP) SP* × NS */
-                *pos += grapheme.len();
-                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                LineBreakClass::CL | LineBreakClass::CP
+                    if get_class!(text[idx..].trim_start()) == NS =>
+                {
+                    /* LB16 Do not break between closing punctuation and a nonstarter (lb=NS), even with
+                     * intervening spaces.
+                     * (CL | CP) SP* × NS */
                     *pos += grapheme.len();
+                    while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                        *pos += grapheme.len();
+                    }
+                    continue;
                 }
-                return self.next();
-            }
-            B2 if get_class!(text[idx..].trim_start()) == B2 => {
-                *pos += grapheme.len();
-                while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                B2 if get_class!(text[idx..].trim_start()) == B2 => {
                     *pos += grapheme.len();
+                    while Some(SP) == next_grapheme_class!(iter, grapheme) {
+                        *pos += grapheme.len();
+                    }
+                    continue;
                 }
-                return self.next();
-            }
-            SP => {
-                /* LB18 Break after spaces.  SP ÷ */
-                // Space 0x20 is 1 byte long.
-                *pos += 1;
-                return Some((*pos, BreakAllowed));
-            }
-            _ => {}
-        }
-        if let Some((next_idx, next_grapheme)) = next_char {
-            let next_class = get_class!(next_grapheme);
-            match next_class {
-                QU if class != SP => {
-                    /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
-                    *pos = *next_idx + next_grapheme.len();
-                    self.iter.next();
-                    return self.next();
+                SP => {
+                    /* LB18 Break after spaces.  SP ÷ */
+                    // Space 0x20 is 1 byte long.
+                    *pos += 1;
+                    return Some((*pos, BreakAllowed));
                 }
                 _ => {}
             }
-        }
-        match class {
-            CB => {
-                /* LB20 Break before and after unresolved CB. */
-                *pos += grapheme.len();
-                return Some((*pos - 1, BreakAllowed));
-            }
-            /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
-             * kana, and other non-starters, or after acute accents.  × BA,  × HY, × NS,  BB × */
-            BB => {
-                *pos += grapheme.len();
-                return self.next();
+            if let Some((next_idx, next_grapheme)) = next_char {
+                let next_class = get_class!(next_grapheme);
+                match next_class {
+                    QU if class != SP => {
+                        /* LB19 Do not break before or after quotation marks, such as ‘ ” ’. */
+                        *pos = *next_idx + next_grapheme.len();
+                        self.iter.next();
+                        continue;
+                    }
+                    _ => {}
+                }
             }
-            _ => {}
-        }
-
-        if let Some((_, next_grapheme)) = next_char {
-            let next_class = get_class!(next_grapheme);
-            match next_class {
-                BA | HY | NS => {
-                    /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
-                     * kana, and other non-starters, or after acute accents.  × BA,  × HY, × NS,  BB × */
+            match class {
+                CB => {
+                    /* LB20 Break before and after unresolved CB. */
+                    *pos += grapheme.len();
+                    return Some((*pos - 1, BreakAllowed));
+                }
+                /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
+                 * kana, and other non-starters, or after acute accents.  × BA,  × HY, × NS,  BB × */
+                BB => {
                     *pos += grapheme.len();
-                    return self.next();
+                    continue;
                 }
                 _ => {}
             }
-        }
-        match class {
-            HL if next_grapheme_class!((next_char is HY, BA)) => {
-                /* LB21a Don’t break after Hebrew + Hyphen.  HL (HY | BA) × */
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
+
+            if let Some((_, next_grapheme)) = next_char {
+                let next_class = get_class!(next_grapheme);
+                match next_class {
+                    BA | HY | NS => {
+                        /* LB21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small
+                         * kana, and other non-starters, or after acute accents.  × BA,  × HY, × NS,  BB × */
+                        *pos += grapheme.len();
+                        continue;
+                    }
+                    _ => {}
+                }
             }
-            /* LB21b Don’t break between ,Solidus and Hebrew letters.  SY × HL */
-            SY if next_grapheme_class!((next_char is HL)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                /* bypass next_char */
-                self.iter.next().unwrap();
-                if let Some((idx, next_grapheme)) = self.iter.next() {
+            match class {
+                HL if next_grapheme_class!((next_char is HY, BA)) => {
+                    /* LB21a Don’t break after Hebrew + Hyphen.  HL (HY | BA) × */
+                    let (idx, next_grapheme) = next_char.unwrap();
                     *pos = idx + next_grapheme.len();
+                    self.iter.next();
+                    continue;
                 }
-                return self.next();
-            }
-            /*  LB22 Do not break between two ellipses, or between letters, numbers or excla-
-             *  mations and ellipsis.
-             *  Examples: ‘9...’, ‘a...’, ‘H...’
-             *  (AL | HL) × IN */
-            AL | HL if next_grapheme_class!((next_char is IN)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /*  EX × IN */
-            EX if next_grapheme_class!((next_char is IN)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            EX => {
-                // LB13
-                *pos += grapheme.len();
-                return self.next();
-            }
-            /*  (ID | EB | EM) × IN */
-            ID | EB | EM if next_grapheme_class!((next_char is IN)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /*  IN × IN */
-            IN if next_grapheme_class!((next_char is IN)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /*  NU × IN */
-            NU if next_grapheme_class!((next_char is IN)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* LB23 Do not break between digits and letters.
-             * (AL | HL) × NU */
-            AL | HL if next_grapheme_class!((next_char is NU)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* NU × (AL | HL) */
-            NU if next_grapheme_class!((next_char is AL, HL)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* LB23a Do not break between numeric prefixes and ideographs, or between ideographs
-             * and numeric postfixes.
-             * PR × (ID | EB | EM) */
-            PR if next_grapheme_class!((next_char is ID, EB, EM)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* (ID | EB | EM) × PO */
-            ID | EB | EM if next_grapheme_class!((next_char is PO)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* B24 Do not break between numeric prefix/postfix and letters, or between
-            letters and prefix/postfix.
-            (PR | PO) × (AL | HL)*/
-            PR | PO if next_grapheme_class!((next_char is AL, HL)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /*(AL | HL) × (PR | PO) */
-            AL | HL if next_grapheme_class!((next_char is PR, PO)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* LB25 Do not break between the following pairs of classes relevant to numbers:
-             * CL × PO */
-            CL if next_grapheme_class!((next_char is PO)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* CP × PO */
-            CP if next_grapheme_class!((next_char is PO)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* CL × PR */
-            CL if next_grapheme_class!((next_char is PR)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* CP × PR */
-            CP if next_grapheme_class!((next_char is PR)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* NU × PO */
-            NU if next_grapheme_class!((next_char is PO)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* NU × PR */
-            NU if next_grapheme_class!((next_char is PR)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* PO × OP */
-            PO if next_grapheme_class!((next_char is OP)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* PO × NU */
-            PO if next_grapheme_class!((next_char is NU)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* PR × OP */
-            PR if next_grapheme_class!((next_char is OP)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* PR × NU */
-            PR if next_grapheme_class!((next_char is NU)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* HY × NU */
-            HY if next_grapheme_class!((next_char is NU)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* IS × NU */
-            IS if next_grapheme_class!((next_char is NU)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* NU × NU */
-            NU if next_grapheme_class!((next_char is NU)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* SY × NU */
-            SY if next_grapheme_class!((next_char is NU)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* LB26 Do not break a Korean syllable.
-             * JL × (JL | JV | H2 | H3) */
-            JL if next_grapheme_class!((next_char is JL, JV, H2, H3)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* (JV | H2) × (JV | JT) */
-            JV | H2 if next_grapheme_class!((next_char is JV, JT)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* (JT | H3) × JT */
-            JT | H3 if next_grapheme_class!((next_char is JT)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* LB27 Treat a Korean Syllable Block the same as ID.
-             * (JL | JV | JT | H2 | H3) × IN */
-            JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is IN)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* (JL | JV | JT | H2 | H3) × PO */
-            JL | JV | JT | H2 | H3 if next_grapheme_class!((next_char is PO)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* PR × (JL | JV | JT | H2 | H3) */
-            PR if next_grapheme_class!((next_char is JL, JV, JT, H2, H3)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* LB28 Do not break between alphabetics (“at”).
-            (AL | HL) × (AL | HL) */
-            AL | HL if next_grapheme_class!((next_char is AL, HL)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* LB29 Do not break between numeric punctuation and alphabetics (“e.g.”).
-            IS × (AL | HL) */
-            IS if next_grapheme_class!((next_char is AL, HL)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* LB30 Do not break between letters, numbers, or ordinary symbols and opening
-            or closing parentheses.
-            (AL | HL | NU) × OP */
-            AL | HL | NU if next_grapheme_class!((next_char is OP)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /* CP × (AL | HL | NU) */
-            CP if next_grapheme_class!((next_char is AL, HL , NU)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            /*LB30b Do not break between an emoji base and an emoji modifier.
-             * EB × EM */
-            EB if next_grapheme_class!((next_char is EM)) => {
-                let (idx, next_grapheme) = next_char.unwrap();
-                *pos = idx + next_grapheme.len();
-                self.iter.next();
-                return self.next();
-            }
-            RI => {
-                /* LB30a Break between two regional indicator symbols if and only if there are an
-                 * even number of regional indicators preceding the position of the break.
-                 * sot (RI RI)* RI × RI
-                 * [^RI] (RI RI)* RI × RI */
-                *reg_ind_streak += 1;
-                *pos += grapheme.len();
-                if *reg_ind_streak % 2 == 1 {
+                /* LB21b Don’t break between Solidus and Hebrew letters.  SY × HL */
+                SY if next_grapheme_class!((next_char is HL)) => {
+                    let (idx, next_grapheme) = next_char.unwrap();
+                    *pos = idx + next_grapheme.len();
+                    /* bypass next_char */
+                    self.iter.next().unwrap();
+                    if let Some((idx, next_grapheme)) = self.iter.next() {
+                        *pos = idx + next_grapheme.len();
+                    }
+                    continue;
+                }
+                /*  LB22 Do not break between two ellipses, or between letters, numbers or
+                 *  exclamations and ellipsis.
+                 *  Examples: ‘9...’, ‘a...’, ‘H...’
+                 *  (AL | HL) × IN */
+                AL | HL if next_grapheme_class!((next_char is IN)) => {
+                    let (idx, next_grapheme) = next_char.unwrap();
+                    *pos = idx + next_grapheme.len();
+                    self.iter.next();
+                    continue;
+                }
+                /*  EX × IN */
+                EX if next_grapheme_class!((next_char is IN)) => {
+                    let (idx, next_grapheme) = next_char.unwrap();
+                    *pos = idx + next_grapheme.len();
+                    self.iter.next();
+                    continue;
+                }
+                EX => {
+                    // LB13
+                    *pos += grapheme.len();
+                    continue;
+                }
+                /*  (ID | EB | EM) × IN */
+                ID | EB | EM if next_grapheme_class!((next_char is IN)) => {
+                    let (idx, next_grapheme) = next_char.unwrap();
+                    *pos = idx + next_grapheme.len();
+                    self.iter.next();
+                    continue;
+                }
+                /*  IN × IN */
+                IN if next_grapheme_class!((next_char is IN)) => {
+                    let (idx, next_grapheme) = next_char.unwrap();
+                    *pos = idx + next_grapheme.len();
+                    self.iter.next();
+                    continue;
+                }
+                /*  NU × IN */
+                NU if next_grapheme_class!((next_char is IN)) => {
+                    let (idx, next_grapheme) = next_char.unwrap();
+                    *pos = idx + next_grapheme.len();
+                    self.iter.next();
+                    continue;
+                }
+                /* LB23 Do not break between digits and letters.
+                 * (AL | HL) × NU */
+                AL | HL if next_grapheme_class!((next_char is NU)) => {
+                    let (idx, next_grapheme) = next_char.unwrap();
+                    *pos = idx + next_grapheme.len();
+                    self.iter.next();
+                    continue;
+                }
+                /* NU × (AL | HL) */
+                NU if next_grapheme_class!((next_char is AL, HL)) => {
+                    let (idx, next_grapheme) = next_char.unwrap();
+                    *pos = idx + next_grapheme.len();
+                    self.iter.next();
+                    continue;
+                }
+                /* LB23a Do not break between numeric prefixes and ideographs, or between ideographs
+                 * and numeric postfixes.
+                 * PR × (ID | EB | EM) */
+                PR if next_grapheme_class!((next_char is ID, EB, EM)) => {
+                    let (idx, next_grapheme) = next_char.unwrap();
+                    *pos = idx + next_grapheme.len();
+                    self.iter.next();
+                    continue;
+                }
+                /* (ID | EB | EM) × PO */
+                ID | EB | EM if next_grapheme_class!((next_char is PO)) => {
+                    let (idx, next_grapheme) = next_char.unwrap();
+                    *pos = idx + next_grapheme.len();
+                    self.iter.next();
+                    continue;
+                }
+                /* LB24 Do not break between numeric prefix/postfix and letters, or between
+                letters and prefix/postfix.
+                (PR | PO) × (AL |
author	Manos Pitsidianakis <el13635@mail.ntua.gr>	2020-05-31 01:08:22 +0300
committer	Manos Pitsidianakis <el13635@mail.ntua.gr>	2020-05-31 01:08:22 +0300
commit	b3b9563db0414e4a483f158585dac9d3e7b0a91c (patch)
tree	eab08a1ad7b4b68e63d757948131902deb28d318 /melib/src/text_processing
parent	6ceed3cae9a13764262913b4d9a1b8302b213551 (diff)