Fix IMAP UTF-7 for code points >= U+10000.

The 20-year-old utf7 conversion functions punted on those values, which was understandable at the time they were written. We now have emojis and an increasing number of characters that might be used in a mailbox. Add encode/decode logic using UTF-16 surrogate pairs. Thanks to Jeff Sipek for reporting this issue, and for giving pointers about how the values should be handled.
author: Kevin McCarthy <kevin@8t8.us> 2020-11-10 15:56:44 -0800
committer: Kevin McCarthy <kevin@8t8.us> 2020-11-12 14:01:16 -0800
commit: e832240e8e9ccabdcd067553d6f4bf4f567dfaec (patch)
tree: f6a72f12f1730dc14e250ad69ed0030f4ab7ecff /imap
parent: 894a49f62182b5f5b8a7afbf53c4e0b0af2d5ae8 (diff)
1 files changed, 83 insertions, 5 deletions
diff --git a/imap/utf7.c b/imap/utf7.c
index c8d44e88..f40f189a 100644
--- a/imap/utf7.c
+++ b/imap/utf7.c
@@ -16,6 +16,29 @@
  *     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  */
 
+/* This file converts between modified UTF-7 and UTF-8.
+ * Modified UTF-7 is described in RFC 3501 section 5.1.3.
+ * Regular UTF-7 is described in RFC 2152.
+ *
+ * In modified UTF-7:
+ *   - printable ASCII characters 0x20-0x25 and 0x27-0x7e represent themselves.
+ *   - "&" (0x26) is represented by the two-octet sequence "&-"
+ *   - other values use the UTF-16 representation of the code point
+ *     and encode it using a modified version of BASE64.
+ *   - BASE64 mode is enabled by "&" and disabled by "-".
+ *
+ * Note that UTF-16:
+ *   - Represents U+0000-U+D7FF and U+E000-U+FFFF directly as the binary
+ *     2-byte value.
+ *   - Reserves U+D800-U+DFFF (so they aren't valid code points.)
+ *   - Values above U+FFFF need to be encoded using a surrogate pair of
+ *     two 16-bit values:
+ *       - subtract 0x10000 from the code point
+ *       - take the top 10 bits and add 0xd800 to get the first (high) surrogate.
+ *       - take the bottom 10 bits and add 0xdc00 for the second (low) surrogate.
+ */
+
+
 #if HAVE_CONFIG_H
 # include "config.h"
 #endif
@@ -58,6 +81,7 @@ static char *utf7_to_utf8 (const char *u7, size_t u7len, char **u8,
 {
   char *buf, *p;
   int b, ch, k;
+  int pair1 = 0;
 
   p = buf = safe_malloc (u7len + u7len / 8 + 1);
 
@@ -101,10 +125,43 @@ static char *utf7_to_utf8 (const char *u7, size_t u7len, char **u8,
 	  }
 	  else
 	  {
-	    *p++ = 0xe0 | (ch >> 12);
-	    *p++ = 0x80 | ((ch >> 6) & 0x3f);
-	    *p++ = 0x80 | (ch & 0x3f);
+            /* High surrogate (first half of a surrogate pair) */
+            if ((ch & ~0x3ff) == 0xd800)
+            {
+              if (pair1)
+                goto bail;
+              pair1 = ch;
+            }
+            else
+            {
+              /* Low surrogate (second half of a surrogate pair) */
+              if ((ch & ~0x3ff) == 0xdc00)
+              {
+                if (!pair1)
+                  goto bail;
+
+                ch = ((pair1 - 0xd800) << 10) + (ch - 0xdc00) + 0x10000;
+                pair1 = 0;
+              }
+              if (pair1)
+                goto bail;
+
+              if (ch < 0x10000)
+              {
+                *p++ = 0xe0 | (ch >> 12);
+                *p++ = 0x80 | ((ch >> 6) & 0x3f);
+                *p++ = 0x80 | (ch & 0x3f);
+              }
+              else
+              {
+                *p++ = 0xf0 | (ch >> 18);
+                *p++ = 0x80 | ((ch >> 12) & 0x3f);
+                *p++ = 0x80 | ((ch >> 6) & 0x3f);
+                *p++ = 0x80 | (ch & 0x3f);
+              }
+            }
 	  }
+
 	  ch = (b << (16 + k)) & 0xffff;
 	  k += 10;
 	}
@@ -143,7 +200,7 @@ bail:
  * Convert the data (u8,u8len) from UTF-8 to RFC 2060's UTF-7.
  * The result is null-terminated and returned, and also stored
  * in (*u7,*u7len) if u7 or u7len is non-zero.
- * Unicode characters above U+FFFF are replaced by U+FFFE.
+ * Unicode characters above U+FFFF are converted to a UTF-16 surrogate pair.
  * If input data is invalid, return 0 and don't store anything.
  */
 static char *utf8_to_utf7 (const char *u8, size_t u8len, char **u7,
@@ -203,8 +260,29 @@ static char *utf8_to_utf7 (const char *u8, size_t u8len, char **u7,
 	b = 0;
 	k = 10;
       }
+
+      /* For code points >= 0x10000 we need to use a UTF-16 surrogate pair.
+       */
       if (ch & ~0xffff)
-	ch = 0xfffe;
+      {
+        int pair1, pair2;
+
+        ch -= 0x10000;
+        pair1 = 0xd800 + (ch >> 10);
+        pair2 = 0xdc00 + (ch & 0x3ff);
+
+        /* Output the high surrogate */
+        *p++ = B64Chars[b | pair1 >> k];
+        k -= 6;
+        for (; k >= 0; k -= 6)
+          *p++ = B64Chars[(pair1 >> k) & 0x3f];
+        b = (pair1 << (-k)) & 0x3f;
+        k += 16;
+
+        /* The low surrogate will be output just below */
+        ch = pair2;
+      }
+
       *p++ = B64Chars[b | ch >> k];
       k -= 6;
       for (; k >= 0; k -= 6)
author	Kevin McCarthy <kevin@8t8.us>	2020-11-10 15:56:44 -0800
committer	Kevin McCarthy <kevin@8t8.us>	2020-11-12 14:01:16 -0800
commit	e832240e8e9ccabdcd067553d6f4bf4f567dfaec (patch)
tree	f6a72f12f1730dc14e250ad69ed0030f4ab7ecff /imap
parent	894a49f62182b5f5b8a7afbf53c4e0b0af2d5ae8 (diff)