From 9e867bfd779894d602487d4fbaabe5d63d94cb52 Mon Sep 17 00:00:00 2001 From: Georges Racinet Date: Sat, 20 Feb 2016 16:10:41 +0100 Subject: Generalized line unfolding in parser (issue #2) Ignoring (CR)LF followed by a single whitespace char is now done in peek() and therefore is enforced in all processing. --- src/vobject/lib.rs | 146 +++++++++++++++++++++++++++++++++++++++++++---------- tests/lib.rs | 34 +++++++++++++ 2 files changed, 154 insertions(+), 26 deletions(-) diff --git a/src/vobject/lib.rs b/src/vobject/lib.rs index 90343c4..4d931fe 100644 --- a/src/vobject/lib.rs +++ b/src/vobject/lib.rs @@ -127,8 +127,37 @@ impl<'s> Parser<'s> { } } - fn peek(&self) -> Option { - self.input[self.pos..].chars().next() + /// look-ahead for next char at given offset from current position + /// (self.pos), taking [line unfolding] + /// (https://tools.ietf.org/html/rfc5545#section-3.1) into account, + /// without actually + /// consuming it (immutable self). + /// + /// Return an option for next char, and needed increment to consume it + /// from current position. + /// CR characters get always skipped, resulting in CRLF to be simplified as + /// LF, which seems to be acceptable because + /// - the remainders of the lib do accept a lone LF as a line termination + /// (a bit laxer than RFC 5545) + /// - CR alone [is not acceptable content] + /// (https://tools.ietf.org/html/rfc5545#section-3.1) + fn peek_at(&self, at: usize) -> Option<(char, usize)> { + match self.input[self.pos+at..].chars().next() { + None => None, + Some('\r') => self.peek_at(at + 1), + Some('\n') => { + match self.peek_at(at + 1) { + Some((' ', offset)) | Some(('\t', offset)) => + self.peek_at(offset), + _ => Some(('\n', at + 1)) + } + }, + Some(x) => { Some((x, at + x.len_utf8())) } + } + } + + fn peek(&self) -> Option<(char, usize)> { + self.peek_at(0) } pub fn eof(&self) -> bool { @@ -137,7 +166,7 @@ impl<'s> Parser<'s> { fn assert_char(&self, c: char) -> ParseResult<()> { let real_c = match self.peek() { - Some(x) => x, + Some((x, _)) => x, None => return Err(ParseError::new(format!("Expected {}, found EOL", c))), }; @@ -149,8 +178,8 @@ impl<'s> Parser<'s> { } fn consume_char(&mut self) -> Option { - match self.peek() { - Some(x) => { self.pos += x.len_utf8(); Some(x) }, + match self.peek_at(0) { + Some((c, offset)) => { self.pos += offset; Some(c) }, None => None } } @@ -185,12 +214,39 @@ impl<'s> Parser<'s> { Ok(()) } - fn consume_while<'a, F: Fn(char) -> bool>(&'a mut self, test: F) -> &'a str { - let start_pos = self.pos; - while !self.eof() && test(self.peek().unwrap()) { - self.consume_char(); + // GR this used to return just a slice from input, but line unfolding + // makes it contradictory, unless one'd want to rescan everything. + // Since actually useful calls used to_owned() on the result, which + // does copy into a String's buffer, let's create a String right away + // implementation detail : instead of pushing char after char, we + // do it by the biggest contiguous slices possible, because I believe it + // to be more efficient (less checks for reallocation etc). + fn consume_while<'a, F: Fn(char) -> bool>(&'a mut self, test: F) -> String { + let mut sl_start_pos = self.pos; + let mut res = String::new(); + while !self.eof() { + match self.peek() { + Some((c, offset)) => { + if !test(c) { + break + } else { + if offset > c.len_utf8() { + // we have some skipping and therefore need to flush + res.push_str(&self.input[sl_start_pos..self.pos]); + res.push(c); + sl_start_pos = self.pos + offset; + } + self.pos += offset; + } + }, + _ => break + } } - &self.input[start_pos..self.pos] + // Final flush + if sl_start_pos < self.pos { + res.push_str(&self.input[sl_start_pos..self.pos]) + } + res } pub fn consume_property(&mut self) -> ParseResult { @@ -216,7 +272,7 @@ impl<'s> Parser<'s> { if rv.len() == 0 { Err(ParseError::new("No property name found.")) } else { - Ok(rv.to_owned()) + Ok(rv) } } @@ -240,16 +296,8 @@ impl<'s> Parser<'s> { } fn consume_property_value<'a>(&'a mut self) -> ParseResult { - let mut rv = String::new(); - loop { - rv.push_str(self.consume_while(|x| x != '\r' && x != '\n')); - try!(self.sloppy_terminate_line()); - - match self.peek() { - Some(' ') | Some('\t') => self.consume_char(), - _ => break, - }; - } + let rv = self.consume_while(|x| x != '\r' && x != '\n'); + try!(self.sloppy_terminate_line()); Ok(rv) } @@ -269,20 +317,20 @@ impl<'s> Parser<'s> { x > '\u{1F}' }; - if self.peek() == Some('"') { + if self.peek() == Some(('"', 1)) { self.consume_char(); - let rv = self.consume_while(qsafe).to_owned(); + let rv = self.consume_while(qsafe); try!(self.assert_char('"')); self.consume_char(); Ok(rv) } else { - Ok(self.consume_while(|x| qsafe(x) && x != ';' && x != ':').to_owned()) + Ok(self.consume_while(|x| qsafe(x) && x != ';' && x != ':')) } } fn consume_param<'a>(&'a mut self) -> ParseResult<(String, String)> { let name = try!(self.consume_param_name()); - let value = if self.peek() == Some('=') { + let value = if self.peek() == Some(('=', 1)) { let start_pos = self.pos; self.consume_char(); match self.consume_param_value() { @@ -298,7 +346,7 @@ impl<'s> Parser<'s> { fn consume_params(&mut self) -> HashMap { let mut rv: HashMap = HashMap::new(); - while self.peek() == Some(';') { + while self.peek() == Some((';', 1)) { self.consume_char(); match self.consume_param() { Ok((name, value)) => { rv.insert(name.to_owned(), value.to_owned()); }, @@ -470,3 +518,49 @@ impl ParseError { self.desc } } + +#[cfg(test)] +mod tests { + use super::Parser; + + #[test] + fn test_unfold1() { + let mut p = Parser{input: "ab\r\n c", pos: 2}; + assert_eq!(p.consume_char(), Some('c')); + assert_eq!(p.pos, 6); + } + + #[test] + fn test_unfold2() { + let mut p = Parser{input: "ab\n\tc\nx", pos: 2}; + assert_eq!(p.consume_char(), Some('c')); + assert_eq!(p.consume_char(), Some('\n')); + assert_eq!(p.consume_char(), Some('x')); + } + + #[test] + fn test_consume_while() { + let mut p = Parser{input:"af\n oo:bar", pos: 1}; + assert_eq!(p.consume_while(|x| x != ':'), "foo"); + assert_eq!(p.consume_char(), Some(':')); + assert_eq!(p.consume_while(|x| x != '\n'), "bar"); + } + + #[test] + fn test_consume_while2() { + let mut p = Parser{input:"af\n oo\n\t:bar", pos: 1}; + assert_eq!(p.consume_while(|x| x != ':'), "foo"); + assert_eq!(p.consume_char(), Some(':')); + assert_eq!(p.consume_while(|x| x != '\n'), "bar"); + } + + #[test] + fn test_consume_while3() { + let mut p = Parser{input:"af\n oo:\n bar", pos: 1}; + assert_eq!(p.consume_while(|x| x != ':'), "foo"); + assert_eq!(p.consume_char(), Some(':')); + assert_eq!(p.consume_while(|x| x != '\n'), "bar"); + } + +} + diff --git a/tests/lib.rs b/tests/lib.rs index ea5c499..4f5e998 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -40,6 +40,8 @@ fn test_line_cont() { VERSION:2.1\n\ N;ENCODING=QUOTED-PRINTABLE:Nikdo;Nikdo=\n\t\ vic\n\ + FN;ENCODING=QUOTED-PRINT\n \ + ABLE:Alice;Alice=vic\n\ NOTE:This ends with equal sign=\n\ TEL;WORK:5555\n \ 4444\n\ @@ -48,6 +50,7 @@ fn test_line_cont() { assert_eq!(item.name, s!("VCARD")); assert_eq!(item.single_prop("TEL").unwrap().raw_value, s!("55554444")); assert_eq!(item.single_prop("N").unwrap().raw_value, s!("Nikdo;Nikdo=vic")); + assert_eq!(item.single_prop("FN").unwrap().raw_value, s!("Alice;Alice=vic")); } #[test] @@ -80,6 +83,37 @@ fn test_icalendar_basic() { assert_eq!(event.single_prop("LOCATION").unwrap().raw_value, s!("Somewhere")); } +#[test] +fn test_icalendar_multline() { + // Adapted from a very popular provider's export + // this used to give ParseError { desc: "Expected :, found \n" } + let event = parse_component( + "BEGIN:VEVENT\n\ + ATTENDEE;CUTYPE=INDIVIDUAL;ROLE=REQ-PARTICIPANT;PARTSTAT=ACCEPTED;CN=Jo\n \ + hn Doe;X-NUM-GUESTS=0:mailto:jd@cal.test\n\ + SUMMARY:Important meeting\n\ + END:VEVENT\n").unwrap(); + + assert_eq!(event.name, s!("VEVENT")); + assert_eq!(event.single_prop("SUMMARY").unwrap().raw_value, + s!("Important meeting")); +} + +#[test] +fn test_icalendar_multline2() { + // Adapted from a very popular provider's export + // this used to give ParseError { desc: "No property name found." } + let event = parse_component( + "BEGIN:VCALENDAR\n\ + BEGIN:VEVENT\n\ + ATTENDEE;CUTYPE=INDIVIDUAL;ROLE=REQ-PARTICIPANT;PARTSTAT=ACCEPTED;CN=Jo\n \ + hn Doe;X-NUM-GUESTS=0:mailto:jd@cal.test\n\ + SUMMARY:Important meeting\n\ + END:VEVENT\n\ + END:VCALENDAR\n").unwrap(); + +} + #[test] fn test_escaping() { let item = parse_component( -- cgit v1.2.3 From c806290562b7832b020cc46960ef538c01a2d15c Mon Sep 17 00:00:00 2001 From: Georges Racinet Date: Sat, 20 Feb 2016 18:13:24 +0100 Subject: Got rid of 'unused' warning --- tests/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/lib.rs b/tests/lib.rs index 4f5e998..d5e4f24 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -112,6 +112,7 @@ fn test_icalendar_multline2() { END:VEVENT\n\ END:VCALENDAR\n").unwrap(); + assert_eq!(event.name, s!("VCALENDAR")); } #[test] -- cgit v1.2.3 From ee01cd35e119c2949eddc1bb957ab1512df4040b Mon Sep 17 00:00:00 2001 From: Georges Racinet Date: Sat, 20 Feb 2016 18:16:45 +0100 Subject: In parser, stopped assuming offset=1 for various delimiters --- src/vobject/lib.rs | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/vobject/lib.rs b/src/vobject/lib.rs index 4d931fe..6ee1dbf 100644 --- a/src/vobject/lib.rs +++ b/src/vobject/lib.rs @@ -184,6 +184,15 @@ impl<'s> Parser<'s> { } } + /// If next peeked char is the given `c`, consume it and return `true`, + /// otherwise return `false`. + fn consume_only_char(&mut self, c: char) -> bool { + match self.peek_at(0) { + Some((d, offset)) if d == c => {self.pos += offset; true}, + _ => false + } + } + fn consume_eol(&mut self) -> ParseResult<()> { let start_pos = self.pos; @@ -317,8 +326,7 @@ impl<'s> Parser<'s> { x > '\u{1F}' }; - if self.peek() == Some(('"', 1)) { - self.consume_char(); + if self.consume_only_char('"') { let rv = self.consume_while(qsafe); try!(self.assert_char('"')); self.consume_char(); @@ -330,9 +338,8 @@ impl<'s> Parser<'s> { fn consume_param<'a>(&'a mut self) -> ParseResult<(String, String)> { let name = try!(self.consume_param_name()); - let value = if self.peek() == Some(('=', 1)) { - let start_pos = self.pos; - self.consume_char(); + let start_pos = self.pos; + let value = if self.consume_only_char('=') { match self.consume_param_value() { Ok(x) => x, Err(e) => { self.pos = start_pos; return Err(e); } @@ -346,8 +353,7 @@ impl<'s> Parser<'s> { fn consume_params(&mut self) -> HashMap { let mut rv: HashMap = HashMap::new(); - while self.peek() == Some((';', 1)) { - self.consume_char(); + while self.consume_only_char(';') { match self.consume_param() { Ok((name, value)) => { rv.insert(name.to_owned(), value.to_owned()); }, Err(_) => break, @@ -562,5 +568,16 @@ mod tests { assert_eq!(p.consume_while(|x| x != '\n'), "bar"); } + #[test] + fn test_consume_only_char() { + let mut p = Parser{input:"\n \"bar", pos: 0}; + assert!(p.consume_only_char('"')); + assert_eq!(p.pos, 3); + assert!(!p.consume_only_char('"')); + assert_eq!(p.pos, 3); + assert!(p.consume_only_char('b')); + assert_eq!(p.pos, 4); + } + } -- cgit v1.2.3 From f1b299799156cb46a59055ab8174d6de6ef37a81 Mon Sep 17 00:00:00 2001 From: Georges Racinet Date: Sat, 20 Feb 2016 18:39:07 +0100 Subject: in parser, don't use peek_at() directly This is for consistency, peek_at() being some kind of internal detail. --- src/vobject/lib.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/vobject/lib.rs b/src/vobject/lib.rs index 6ee1dbf..b66e73f 100644 --- a/src/vobject/lib.rs +++ b/src/vobject/lib.rs @@ -156,6 +156,7 @@ impl<'s> Parser<'s> { } } + #[inline] fn peek(&self) -> Option<(char, usize)> { self.peek_at(0) } @@ -178,7 +179,7 @@ impl<'s> Parser<'s> { } fn consume_char(&mut self) -> Option { - match self.peek_at(0) { + match self.peek() { Some((c, offset)) => { self.pos += offset; Some(c) }, None => None } @@ -187,7 +188,7 @@ impl<'s> Parser<'s> { /// If next peeked char is the given `c`, consume it and return `true`, /// otherwise return `false`. fn consume_only_char(&mut self, c: char) -> bool { - match self.peek_at(0) { + match self.peek() { Some((d, offset)) if d == c => {self.pos += offset; true}, _ => false } -- cgit v1.2.3 From 0cb9d2c3987fae3119f59b7c3ef74f9596f355e2 Mon Sep 17 00:00:00 2001 From: Georges Racinet Date: Sat, 20 Feb 2016 18:41:23 +0100 Subject: Explicit lifetime has become useless --- src/vobject/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vobject/lib.rs b/src/vobject/lib.rs index b66e73f..04ce8e3 100644 --- a/src/vobject/lib.rs +++ b/src/vobject/lib.rs @@ -231,7 +231,7 @@ impl<'s> Parser<'s> { // implementation detail : instead of pushing char after char, we // do it by the biggest contiguous slices possible, because I believe it // to be more efficient (less checks for reallocation etc). - fn consume_while<'a, F: Fn(char) -> bool>(&'a mut self, test: F) -> String { + fn consume_while bool>(&mut self, test: F) -> String { let mut sl_start_pos = self.pos; let mut res = String::new(); while !self.eof() { -- cgit v1.2.3