Merge pull request #9 from gracinet/master

Generalized line unfolding in parser (issue #8)
author: Markus Unterwaditzer <markus@unterwaditzer.net> 2016-02-20 19:32:30 +0100
committer: Markus Unterwaditzer <markus@unterwaditzer.net> 2016-02-20 19:32:30 +0100
commit: 8d4971ef24b34d67ac91152aa6048de1d75362b6 (patch)
tree: 0282ab8afa9d8725f1d293e5493cc85f6c01ab4c
parent: 10b30e94ba8de939e052839f9b2a4a832521721e (diff)
parent: 0cb9d2c3987fae3119f59b7c3ef74f9596f355e2 (diff)
2 files changed, 176 insertions, 29 deletions
diff --git a/src/vobject/lib.rs b/src/vobject/lib.rs
index 90343c4..04ce8e3 100644
--- a/src/vobject/lib.rs
+++ b/src/vobject/lib.rs
@@ -127,8 +127,38 @@ impl<'s> Parser<'s> {
         }
     }
 
-    fn peek(&self) -> Option<char> {
-        self.input[self.pos..].chars().next()
+    /// Look ahead for the next char at the given byte offset from the current
+    /// position (self.pos), taking [line unfolding]
+    /// (https://tools.ietf.org/html/rfc5545#section-3.1) into account,
+    /// without actually
+    /// consuming it (immutable self).
+    ///
+    /// Returns the next char, if any, together with the increment needed to
+    /// consume it from the current position.
+    /// CR characters are always skipped, so that CRLF is simplified to
+    /// LF, which seems to be acceptable because
+    /// - the remainder of the lib accepts a lone LF as a line terminator
+    ///   (a bit laxer than RFC 5545)
+    /// - CR alone [is not acceptable content]
+    ///   (https://tools.ietf.org/html/rfc5545#section-3.1)
+    fn peek_at(&self, at: usize) -> Option<(char, usize)> {
+        match self.input[self.pos+at..].chars().next() {
+            None => None,
+            Some('\r') => self.peek_at(at + 1),
+            Some('\n') => {
+                match self.peek_at(at + 1) {
+                    Some((' ', offset)) | Some(('\t', offset)) =>
+                        self.peek_at(offset),
+                    _ => Some(('\n', at + 1))
+                }
+            },
+            Some(x) => { Some((x, at + x.len_utf8())) }
+        }
+    }
+
+    #[inline]
+    fn peek(&self) -> Option<(char, usize)> {
+        self.peek_at(0)
     }
 
     pub fn eof(&self) -> bool {
@@ -137,7 +167,7 @@ impl<'s> Parser<'s> {
 
     fn assert_char(&self, c: char) -> ParseResult<()> {
         let real_c = match self.peek() {
-            Some(x) => x,
+            Some((x, _)) => x,
             None => return Err(ParseError::new(format!("Expected {}, found EOL", c))),
         };
 
@@ -150,11 +180,20 @@ impl<'s> Parser<'s> {
 
     fn consume_char(&mut self) -> Option<char> {
         match self.peek() {
-            Some(x) => { self.pos += x.len_utf8(); Some(x) },
+            Some((c, offset)) => { self.pos += offset; Some(c) },
             None => None
         }
     }
 
+    /// If next peeked char is the given `c`, consume it and return `true`,
+    /// otherwise return `false`.
+    fn consume_only_char(&mut self, c: char) -> bool {
+        match self.peek() {
+            Some((d, offset)) if d == c => {self.pos += offset; true},
+            _ => false
+        }
+    }
+
     fn consume_eol(&mut self) -> ParseResult<()> {
         
         let start_pos = self.pos;
@@ -185,12 +224,39 @@ impl<'s> Parser<'s> {
         Ok(())
     }
 
-    fn consume_while<'a, F: Fn(char) -> bool>(&'a mut self, test: F) -> &'a str {
-        let start_pos = self.pos;
-        while !self.eof() && test(self.peek().unwrap()) {
-            self.consume_char();
+    // GR: this used to return just a slice of the input, but line unfolding
+    // makes that contradictory, unless one wanted to rescan everything.
+    // Since the actually useful calls used to_owned() on the result, which
+    // copies into a String's buffer anyway, let's create a String right away.
+    // Implementation detail: instead of pushing char after char, we
+    // push the biggest contiguous slices possible, because I believe it
+    // to be more efficient (fewer checks for reallocation etc).
+    fn consume_while<F: Fn(char) -> bool>(&mut self, test: F) -> String {
+        let mut sl_start_pos = self.pos;
+        let mut res = String::new();
+        while !self.eof() {
+            match self.peek() {
+                Some((c, offset)) => {
+                    if !test(c) {
+                        break
+                    } else {
+                        if offset > c.len_utf8() {
+                            // we have some skipping and therefore need to flush
+                            res.push_str(&self.input[sl_start_pos..self.pos]);
+                            res.push(c);
+                            sl_start_pos = self.pos + offset;
+                        }
+                        self.pos += offset;
+                    }
+                },
+                _ => break
+            }
         }
-        &self.input[start_pos..self.pos]
+        // Final flush
+        if sl_start_pos < self.pos {
+            res.push_str(&self.input[sl_start_pos..self.pos])
+        }
+        res
     }
 
     pub fn consume_property(&mut self) -> ParseResult<Property> {
@@ -216,7 +282,7 @@ impl<'s> Parser<'s> {
         if rv.len() == 0 {
             Err(ParseError::new("No property name found."))
         } else {
-            Ok(rv.to_owned())
+            Ok(rv)
         }
     }
 
@@ -240,16 +306,8 @@ impl<'s> Parser<'s> {
     }
 
     fn consume_property_value<'a>(&'a mut self) -> ParseResult<String> {
-        let mut rv = String::new();
-        loop {
-            rv.push_str(self.consume_while(|x| x != '\r' && x != '\n'));
-            try!(self.sloppy_terminate_line());
-
-            match self.peek() {
-                Some(' ') | Some('\t') => self.consume_char(),
-                _ => break,
-            };
-        }
+        let rv = self.consume_while(|x| x != '\r' && x != '\n');
+        try!(self.sloppy_terminate_line());
         Ok(rv)
     }
 
@@ -269,22 +327,20 @@ impl<'s> Parser<'s> {
             x > '\u{1F}'
         };
 
-        if self.peek() == Some('"') {
-            self.consume_char();
-            let rv = self.consume_while(qsafe).to_owned();
+        if self.consume_only_char('"') {
+            let rv = self.consume_while(qsafe);
             try!(self.assert_char('"'));
             self.consume_char();
             Ok(rv)
         } else {
-            Ok(self.consume_while(|x| qsafe(x) && x != ';' && x != ':').to_owned())
+            Ok(self.consume_while(|x| qsafe(x) && x != ';' && x != ':'))
         }
     }
 
     fn consume_param<'a>(&'a mut self) -> ParseResult<(String, String)> {
         let name = try!(self.consume_param_name());
-        let value = if self.peek() == Some('=') {
-            let start_pos = self.pos;
-            self.consume_char();
+        let start_pos = self.pos;
+        let value = if self.consume_only_char('=') {
             match self.consume_param_value() {
                 Ok(x) => x,
                 Err(e) => { self.pos = start_pos; return Err(e); }
@@ -298,8 +354,7 @@ impl<'s> Parser<'s> {
 
     fn consume_params(&mut self) -> HashMap<String, String> {
         let mut rv: HashMap<String, String> = HashMap::new();
-        while self.peek() == Some(';') {
-            self.consume_char();
+        while self.consume_only_char(';') {
             match self.consume_param() {
                 Ok((name, value)) => { rv.insert(name.to_owned(), value.to_owned()); },
                 Err(_) => break,
@@ -470,3 +525,60 @@ impl ParseError {
         self.desc
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::Parser;
+
+    #[test]
+    fn test_unfold1() {
+        let mut p = Parser{input: "ab\r\n c", pos: 2};
+        assert_eq!(p.consume_char(), Some('c'));
+        assert_eq!(p.pos, 6);
+    }
+
+    #[test]
+    fn test_unfold2() {
+        let mut p = Parser{input: "ab\n\tc\nx", pos: 2};
+        assert_eq!(p.consume_char(), Some('c'));
+        assert_eq!(p.consume_char(), Some('\n'));
+        assert_eq!(p.consume_char(), Some('x'));
+    }
+
+    #[test]
+    fn test_consume_while() {
+        let mut p = Parser{input:"af\n oo:bar", pos: 1};
+        assert_eq!(p.consume_while(|x| x != ':'), "foo");
+        assert_eq!(p.consume_char(), Some(':'));
+        assert_eq!(p.consume_while(|x| x != '\n'), "bar");
+    }
+
+    #[test]
+    fn test_consume_while2() {
+        let mut p = Parser{input:"af\n oo\n\t:bar", pos: 1};
+        assert_eq!(p.consume_while(|x| x != ':'), "foo");
+        assert_eq!(p.consume_char(), Some(':'));
+        assert_eq!(p.consume_while(|x| x != '\n'), "bar");
+    }
+
+    #[test]
+    fn test_consume_while3() {
+        let mut p = Parser{input:"af\n oo:\n bar", pos: 1};
+        assert_eq!(p.consume_while(|x| x != ':'), "foo");
+        assert_eq!(p.consume_char(), Some(':'));
+        assert_eq!(p.consume_while(|x| x != '\n'), "bar");
+    }
+
+    #[test]
+    fn test_consume_only_char() {
+        let mut p = Parser{input:"\n \"bar", pos: 0};
+        assert!(p.consume_only_char('"'));
+        assert_eq!(p.pos, 3);
+        assert!(!p.consume_only_char('"'));
+        assert_eq!(p.pos, 3);
+        assert!(p.consume_only_char('b'));
+        assert_eq!(p.pos, 4);
+    }
+
+}
+
diff --git a/tests/lib.rs b/tests/lib.rs
index ea5c499..d5e4f24 100644
--- a/tests/lib.rs
+++ b/tests/lib.rs
@@ -40,6 +40,8 @@ fn test_line_cont() {
         VERSION:2.1\n\
         N;ENCODING=QUOTED-PRINTABLE:Nikdo;Nikdo=\n\t\
         vic\n\
+        FN;ENCODING=QUOTED-PRINT\n \
+        ABLE:Alice;Alice=vic\n\
         NOTE:This ends with equal sign=\n\
         TEL;WORK:5555\n \
         4444\n\
@@ -48,6 +50,7 @@ fn test_line_cont() {
     assert_eq!(item.name, s!("VCARD"));
     assert_eq!(item.single_prop("TEL").unwrap().raw_value, s!("55554444"));
     assert_eq!(item.single_prop("N").unwrap().raw_value, s!("Nikdo;Nikdo=vic"));
+    assert_eq!(item.single_prop("FN").unwrap().raw_value, s!("Alice;Alice=vic"));
 }
 
 #[test]
@@ -81,6 +84,38 @@ fn test_icalendar_basic() {
 }
 
 #[test]
+fn test_icalendar_multline() {
+    // Adapted from a very popular provider's export
+    // this used to give ParseError { desc: "Expected :, found \n" }
+    let event = parse_component(
+        "BEGIN:VEVENT\n\
+        ATTENDEE;CUTYPE=INDIVIDUAL;ROLE=REQ-PARTICIPANT;PARTSTAT=ACCEPTED;CN=Jo\n \
+        hn Doe;X-NUM-GUESTS=0:mailto:jd@cal.test\n\
+        SUMMARY:Important meeting\n\
+        END:VEVENT\n").unwrap();
+
+    assert_eq!(event.name, s!("VEVENT"));
+    assert_eq!(event.single_prop("SUMMARY").unwrap().raw_value,
+               s!("Important meeting"));
+}
+
+#[test]
+fn test_icalendar_multline2() {
+    // Adapted from a very popular provider's export
+    // this used to give ParseError { desc: "No property name found." }
+    let event = parse_component(
+        "BEGIN:VCALENDAR\n\
+        BEGIN:VEVENT\n\
+        ATTENDEE;CUTYPE=INDIVIDUAL;ROLE=REQ-PARTICIPANT;PARTSTAT=ACCEPTED;CN=Jo\n \
+        hn Doe;X-NUM-GUESTS=0:mailto:jd@cal.test\n\
+        SUMMARY:Important meeting\n\
+        END:VEVENT\n\
+        END:VCALENDAR\n").unwrap();
+
+    assert_eq!(event.name, s!("VCALENDAR"));
+}
+
+#[test]
 fn test_escaping() {
     let item = parse_component(
             "BEGIN:VCALENDAR\n\
author	Markus Unterwaditzer <markus@unterwaditzer.net>	2016-02-20 19:32:30 +0100
committer	Markus Unterwaditzer <markus@unterwaditzer.net>	2016-02-20 19:32:30 +0100
commit	8d4971ef24b34d67ac91152aa6048de1d75362b6 (patch)
tree	0282ab8afa9d8725f1d293e5493cc85f6c01ab4c
parent	10b30e94ba8de939e052839f9b2a4a832521721e (diff)
parent	0cb9d2c3987fae3119f59b7c3ef74f9596f355e2 (diff)