From 8926fab3cb78d4324fb63c480e641bd9dab30ec4 Mon Sep 17 00:00:00 2001 From: Justus Winter Date: Thu, 17 Sep 2020 18:05:35 +0200 Subject: openpgp: Make conventional User ID documentation available. - Move the documentation, fix some formatting to prevent automatic escaping and mangling of the grammar. - Add some links to the various methods. - Fixes #558. --- openpgp/src/packet/userid.rs | 332 +++++++++++++++++++++++++------------------ 1 file changed, 197 insertions(+), 135 deletions(-) (limited to 'openpgp') diff --git a/openpgp/src/packet/userid.rs b/openpgp/src/packet/userid.rs index c6acab51..b583260c 100644 --- a/openpgp/src/packet/userid.rs +++ b/openpgp/src/packet/userid.rs @@ -17,136 +17,6 @@ use crate::Packet; use crate::Error; /// A conventionally parsed UserID. -/// -/// Informally, conventional UserIDs are of the form: -/// -/// - First Last (Comment) -/// - First Last -/// - First Last -/// - name@example.org -/// - -/// - name@example.org -/// -/// - Name (Comment) -/// - Name (Comment) -/// - Name -/// - -/// - scheme://hostname/path -/// -/// Names consist of UTF-8 non-control characters and may include -/// punctuation. For instance, the following names are valid: -/// -/// - Acme Industries, Inc. -/// - Michael O'Brian -/// - Smith, John -/// - e.e. cummings -/// -/// (Note: according to RFC 2822 and its successors, all of these -/// would need to be quoted. Conventionally, no implementation quotes -/// names.) -/// -/// Conventional User IDs are UTF-8. RFC 2822 only covers US-ASCII -/// and allows character set switching using RFC 2047. For example, -/// an RFC 2822 parser would parse: -/// -/// - Bj=?utf-8?q?=C3=B6?=rn Bj=?utf-8?q?=C3=B6?=rnson -/// -/// "Björn Björnson". Nobody uses this in practice, and, as such, -/// this extension is not supported by this parser. -/// -/// Comments can include any UTF-8 text except parentheses. 
Thus, the -/// following is not a valid comment even though the parentheses are -/// balanced: -/// -/// - (foo (bar)) -/// -/// URIs -/// ---- -/// -/// The URI parser recognizes URIs using a regular expression similar -/// to the one recommended in [RFC 3986] with the following extensions -/// and restrictions: -/// -/// - UTF-8 characters are in the range \u{80}-\u{10ffff} are -/// allowed wherever percent-encoded characters are allowed (i.e., -/// everywhere but the schema). -/// -/// - The scheme component and its trailing ":" are required. -/// -/// - The URI must have an authority component ("//domain") or a -/// path component ("/path/to/resource"). -/// -/// - Although the RFC does not allow it, in practice, the '[' and -/// ']' characters are allowed wherever percent-encoded characters -/// are allowed (i.e., everywhere but the schema). -/// -/// URIs are neither normalized nor interpreted. For instance, dot -/// segments are not removed, escape sequences are not decoded, etc. -/// -/// Note: the recommended regular expression is less strict than the -/// grammar. For instance, a percent encoded character must consist -/// of three characters: the percent character followed by two hex -/// digits. The parser that we use does not enforce this either. -/// -/// [RFC 3986]: https://tools.ietf.org/html/rfc3986 -/// -/// Formal Grammar -/// -------------- -/// -/// Formally, the following grammar is used to decompose a User ID: -/// -/// WS = 0x20 (space character) -/// -/// comment-specials = "<" / ">" / ; RFC 2822 specials - "(" and ")" -/// "[" / "]" / -/// ":" / ";" / -/// "@" / "\" / -/// "," / "." / -/// DQUOTE -/// -/// atext-specials = "(" / ")" / ; RFC 2822 specials - "<" and ">". -/// "[" / "]" / -/// ":" / ";" / -/// "@" / "\" / -/// "," / "." / -/// DQUOTE -/// -/// atext = ALPHA / DIGIT / ; Any character except controls, -/// "!" / "#" / ; SP, and specials. 
-/// "$" / "%" / ; Used for atoms -/// "&" / "'" / -/// "*" / "+" / -/// "-" / "/" / -/// "=" / "?" / -/// "^" / "_" / -/// "`" / "{" / -/// "|" / "}" / -/// "~" / -/// \u{80}-\u{10ffff} ; Non-ascii, non-control UTF-8 -/// -/// dot_atom_text = 1*atext *("." *atext) -/// -/// name-char-start = atext / atext-specials -/// -/// name-char-rest = atext / atext-specials / WS -/// -/// name = name-char-start *name-char-rest -/// -/// comment-char = atext / comment-specials / WS -/// -/// comment-content = *comment-char -/// -/// comment = "(" *WS comment-content *WS ")" -/// -/// addr-spec = dot-atom-text "@" dot-atom-text -/// -/// uri = See [RFC 3986] and the note on URIs above. -/// -/// pgp-uid-convention = addr-spec / -/// uri / -/// *WS [name] *WS [comment] *WS "<" addr-spec ">" / -/// *WS [name] *WS [comment] *WS "<" uri ">" / -/// *WS name *WS [comment] *WS #[derive(Clone, Debug)] pub struct ConventionallyParsedUserID { userid: String, @@ -428,9 +298,167 @@ impl ConventionallyParsedUserID { /// Holds a UserID packet. /// -/// See [Section 5.11 of RFC 4880] for details. +/// The standard imposes no structure on UserIDs, but suggests to +/// follow [RFC 2822]. See [Section 5.11 of RFC 4880] for details. +/// In practice though, implementations do not follow [RFC 2822], or +/// do not even help their users in producing well-formed User IDs. +/// Experience has shown that parsing User IDs using [RFC 2822] does +/// not work, so we are taking a more pragmatic approach and define +/// what we call *Conventional User IDs*. /// +/// [RFC 2822]: https://tools.ietf.org/html/rfc2822 /// [Section 5.11 of RFC 4880]: https://tools.ietf.org/html/rfc4880#section-5.11 +/// +/// Using this definition, we provide methods to extract the [name], +/// [comment], [email address], or [URI] from `UserID` packets. +/// Furthermore, we provide a way to [canonicalize the email address] +/// found in a `UserID` packet. 
We provide [two] [constructors] that +/// create well-formed User IDs from email address, and optional name +/// and comment. +/// +/// [name]: #method.name +/// [comment]: #method.comment +/// [email address]: #method.email +/// [URI]: #method.uri +/// [canonicalize the email address]: #method.email_normalized +/// [two]: #method.from_address +/// [constructors]: #method.from_unchecked_address +/// +/// # Conventional User IDs +/// +/// Informally, conventional User IDs are of the form: +/// +/// - `First Last (Comment) <name@example.org>` +/// - `First Last <name@example.org>` +/// - `First Last` +/// - `name@example.org <name@example.org>` +/// - `<name@example.org>` +/// - `name@example.org` +/// +/// - `Name (Comment) <scheme://hostname/path>` +/// - `Name (Comment) <scheme://hostname/path>` +/// - `Name <scheme://hostname/path>` +/// - `<scheme://hostname/path>` +/// - `scheme://hostname/path` +/// +/// Names consist of UTF-8 non-control characters and may include +/// punctuation. For instance, the following names are valid: +/// +/// - `Acme Industries, Inc.` +/// - `Michael O'Brian` +/// - `Smith, John` +/// - `e.e. cummings` +/// +/// (Note: according to [RFC 2822] and its successors, all of these +/// would need to be quoted. Conventionally, no implementation quotes +/// names.) +/// +/// Conventional User IDs are UTF-8. [RFC 2822] only covers US-ASCII +/// and allows character set switching using [RFC 2047]. For example, +/// an [RFC 2822] parser would parse: +/// +/// - Bj=?utf-8?q?=C3=B6?=rn Bj=?utf-8?q?=C3=B6?=rnson +/// +/// [RFC 2047]: https://tools.ietf.org/html/rfc2047 +/// +/// "Björn Björnson". Nobody uses this in practice, and, as such, +/// this extension is not supported by this parser. +/// +/// Comments can include any UTF-8 text except parentheses. 
Thus, the +/// following is not a valid comment even though the parentheses are +/// balanced: +/// +/// - `(foo (bar))` +/// +/// URIs +/// ---- +/// +/// The URI parser recognizes URIs using a regular expression similar +/// to the one recommended in [RFC 3986] with the following extensions +/// and restrictions: +/// +/// - UTF-8 characters in the range `\u{80}-\u{10ffff}` are +/// allowed wherever percent-encoded characters are allowed (i.e., +/// everywhere but the scheme). +/// +/// - The scheme component and its trailing `:` are required. +/// +/// - The URI must have an authority component (`//domain`) or a +/// path component (`/path/to/resource`). +/// +/// - Although the RFC does not allow it, in practice, the `[` and +/// `]` characters are allowed wherever percent-encoded characters +/// are allowed (i.e., everywhere but the scheme). +/// +/// URIs are neither normalized nor interpreted. For instance, dot +/// segments are not removed, escape sequences are not decoded, etc. +/// +/// Note: the recommended regular expression is less strict than the +/// grammar. For instance, a percent encoded character must consist +/// of three characters: the percent character followed by two hex +/// digits. The parser that we use does not enforce this either. +/// +/// [RFC 3986]: https://tools.ietf.org/html/rfc3986 +/// +/// Formal Grammar +/// -------------- +/// +/// Formally, the following grammar is used to decompose a User ID: +/// +/// ```text +/// WS = 0x20 (space character) +/// +/// comment-specials = "<" / ">" / ; RFC 2822 specials - "(" and ")" +/// "[" / "]" / +/// ":" / ";" / +/// "@" / "\" / +/// "," / "." / +/// DQUOTE +/// +/// atext-specials = "(" / ")" / ; RFC 2822 specials - "<" and ">". +/// "[" / "]" / +/// ":" / ";" / +/// "@" / "\" / +/// "," / "." / +/// DQUOTE +/// +/// atext = ALPHA / DIGIT / ; Any character except controls, +/// "!" / "#" / ; SP, and specials. 
+/// "$" / "%" / ; Used for atoms +/// "&" / "'" / +/// "*" / "+" / +/// "-" / "/" / +/// "=" / "?" / +/// "^" / "_" / +/// "`" / "{" / +/// "|" / "}" / +/// "~" / +/// \u{80}-\u{10ffff} ; Non-ascii, non-control UTF-8 +/// +/// dot-atom-text = 1*atext *("." *atext) +/// +/// name-char-start = atext / atext-specials +/// +/// name-char-rest = atext / atext-specials / WS +/// +/// name = name-char-start *name-char-rest +/// +/// comment-char = atext / comment-specials / WS +/// +/// comment-content = *comment-char +/// +/// comment = "(" *WS comment-content *WS ")" +/// +/// addr-spec = dot-atom-text "@" dot-atom-text +/// +/// uri = See [RFC 3986] and the note on URIs above. +/// +/// pgp-uid-convention = addr-spec / +/// uri / +/// *WS [name] *WS [comment] *WS "<" addr-spec ">" / +/// *WS [name] *WS [comment] *WS "<" uri ">" / +/// *WS name *WS [comment] *WS +/// ``` pub struct UserID { /// CTB packet header fields. pub(crate) common: packet::Common, @@ -656,8 +684,8 @@ impl UserID { /// Constructs a User ID. /// - /// This does a basic check and any necessary escaping to form a de - /// facto User ID. + /// This does a basic check and any necessary escaping to form a + /// [conventional User ID]. /// /// Only the address is required. If a comment is supplied, then /// a name is also required. @@ -665,6 +693,8 @@ impl UserID { /// If you already have a User ID value, then you can just /// use `UserID::from()`. /// + /// [conventional User ID]: #conventional-user-ids + /// /// ``` /// # extern crate sequoia_openpgp as openpgp; /// # use openpgp::packet::UserID; @@ -686,8 +716,9 @@ impl UserID { /// Constructs a User ID. /// - /// This does a basic check and any necessary escaping to form a de - /// facto User ID modulo the address, which is not checked. + /// This does a basic check and any necessary escaping to form a + /// [conventional User ID] modulo the address, which is not + /// checked. 
/// /// This is useful when you want to specify a URI instead of an /// email address. @@ -695,6 +726,8 @@ impl UserID { /// If you already have a User ID value, then you can just /// use `UserID::from()`. /// + /// [conventional User ID]: #conventional-user-ids + /// /// ``` /// # extern crate sequoia_openpgp as openpgp; /// # use openpgp::packet::UserID; @@ -715,6 +748,19 @@ impl UserID { } /// Gets the user ID packet's value. + /// + /// This returns the raw, uninterpreted value. See + /// [`UserID::name`], [`UserID::email`], + /// [`UserID::email_normalized`], [`UserID::uri`], and + /// [`UserID::comment`] for how to extract parts of [conventional + /// User ID]s. + /// + /// [`UserID::name`]: #method.name + /// [`UserID::email`]: #method.email + /// [`UserID::email_normalized`]: #method.email_normalized + /// [`UserID::uri`]: #method.uri + /// [`UserID::comment`]: #method.comment + /// [conventional User ID]: #conventional-user-ids pub fn value(&self) -> &[u8] { self.value.as_slice() } @@ -739,6 +785,10 @@ impl UserID { /// Parses the User ID according to de facto conventions, and /// returns the name component, if any. + /// + /// See [conventional User ID] for more information. + /// + /// [conventional User ID]: #conventional-user-ids pub fn name(&self) -> Result<Option<String>> { self.do_parse()?; match *self.parsed.lock().unwrap().borrow() { @@ -749,6 +799,10 @@ impl UserID { /// Parses the User ID according to de facto conventions, and /// returns the comment field, if any. + /// + /// See [conventional User ID] for more information. + /// + /// [conventional User ID]: #conventional-user-ids pub fn comment(&self) -> Result<Option<String>> { self.do_parse()?; match *self.parsed.lock().unwrap().borrow() { @@ -759,6 +813,10 @@ impl UserID { /// Parses the User ID according to de facto conventions, and /// returns the email address, if any. + /// + /// See [conventional User ID] for more information. 
+ /// + /// [conventional User ID]: #conventional-user-ids pub fn email(&self) -> Result<Option<String>> { self.do_parse()?; match *self.parsed.lock().unwrap().borrow() { @@ -769,6 +827,10 @@ impl UserID { /// Parses the User ID according to de facto conventions, and /// returns the URI, if any. + /// + /// See [conventional User ID] for more information. + /// + /// [conventional User ID]: #conventional-user-ids pub fn uri(&self) -> Result<Option<String>> { self.do_parse()?; match *self.parsed.lock().unwrap().borrow() { -- cgit v1.2.3