From 4b75bd363de3f3429c62359be4e4aa4da7fa233b Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 18 May 2020 15:07:13 -0400 Subject: efi/libstub: Add UTF-8 decoding to efi_puts In order to be able to use the UTF-16 support added to vsprintf in the previous commit, enhance efi_puts to decode UTF-8 into UTF-16. Invalid UTF-8 encodings are passed through unchanged. Signed-off-by: Arvind Sankar Link: https://lore.kernel.org/r/20200518190716.751506-22-nivedita@alum.mit.edu Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/efi-stub-helper.c | 67 ++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 5 deletions(-) (limited to 'drivers/firmware/efi') diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index f338d149aaa5..0d0007355c1e 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -36,17 +36,74 @@ void efi_char16_puts(efi_char16_t *str) output_string, str); } +static +u32 utf8_to_utf32(const u8 **s8) +{ + u32 c32; + u8 c0, cx; + size_t clen, i; + + c0 = cx = *(*s8)++; + /* + * The position of the most-significant 0 bit gives us the length of + * a multi-octet encoding. + */ + for (clen = 0; cx & 0x80; ++clen) + cx <<= 1; + /* + * If the 0 bit is in position 8, this is a valid single-octet + * encoding. If the 0 bit is in position 7 or positions 1-3, the + * encoding is invalid. + * In either case, we just return the first octet. + */ + if (clen < 2 || clen > 4) + return c0; + /* Get the bits from the first octet. */ + c32 = cx >> clen--; + for (i = 0; i < clen; ++i) { + /* Trailing octets must have 10 in most significant bits. */ + cx = (*s8)[i] ^ 0x80; + if (cx & 0xc0) + return c0; + c32 = (c32 << 6) | cx; + } + /* + * Check for validity: + * - The character must be in the Unicode range. + * - It must not be a surrogate. + * - It must be encoded using the correct number of octets. + */ + if (c32 > 0x10ffff || + (c32 & 0xf800) == 0xd800 || + clen != (c32 >= 0x80) + (c32 >= 0x800) + (c32 >= 0x10000)) + return c0; + *s8 += clen; + return c32; +} + void efi_puts(const char *str) { efi_char16_t buf[128]; size_t pos = 0, lim = ARRAY_SIZE(buf); + const u8 *s8 = (const u8 *)str; + u32 c32; - while (*str) { - if (*str == '\n') + while (*s8) { + if (*s8 == '\n') buf[pos++] = L'\r'; - /* Cast to unsigned char to avoid sign-extension */ - buf[pos++] = (unsigned char)(*str++); - if (*str == '\0' || pos >= lim - 2) { + c32 = utf8_to_utf32(&s8); + if (c32 < 0x10000) { + /* Characters in plane 0 use a single word. */ + buf[pos++] = c32; + } else { + /* + * Characters in other planes encode into a surrogate + * pair. + */ + buf[pos++] = (0xd800 - (0x10000 >> 10)) + (c32 >> 10); + buf[pos++] = 0xdc00 + (c32 & 0x3ff); + } + if (*s8 == '\0' || pos >= lim - 2) { buf[pos] = L'\0'; efi_char16_puts(buf); pos = 0; -- cgit v1.2.3