diff options
author | Thomas Roessler <roessler@does-not-exist.org> | 1999-03-30 23:50:33 +0000 |
---|---|---|
committer | Thomas Roessler <roessler@does-not-exist.org> | 1999-03-30 23:50:33 +0000 |
commit | ec8c796bd17158c54f2415b1e71a82d309687126 (patch) | |
tree | 472eb9f8f428af3c73f39c6e63ea1cebad36facd | |
parent | a784b853587106e064e2c72d2058b88223434a6a (diff) |
This patch removes at least some of the horrible utf-8 kluges in
charset.c. The new DECODER framework is currently only used in
handler.c, and there in a horribly inefficient manner. We should
use larger blocks of data, which would be much more efficient than
what we are currently doing.
Most of the other charset-related code still uses the old
mutt_display_char() & friends interface, which is actually OK as long
as you don't try to handle multibyte character sets.
The most notable change should be the one to mutt_get_translation():
It now delays the loading and parsing of character set information
files until they are really needed; a quick check up front catches a
large number of standard cases. As a side effect, this will make
"iso tagged as ascii" "work" again, as long as both sides use the
same iso character set.
-rw-r--r-- | charset.c | 281 | ||||
-rw-r--r-- | charset.h | 38 | ||||
-rw-r--r-- | handler.c | 114 |
3 files changed, 297 insertions, 136 deletions
@@ -682,21 +682,30 @@ CHARSET_MAP *mutt_get_translation(const char *_from, const char *_to) if(!_from || !_to) return NULL; - - init_charsets(); canonical_charset(from_canon, sizeof(from_canon), _from); canonical_charset(to_canon, sizeof(to_canon), _to); + /* quick check for some trivial cases. Doing this before + * we actually call the initialization routine delays character + * set loading until it's _really_ needed. + */ + + if(!strcmp(from_canon, to_canon) + || (!strcmp (from_canon, "us-ascii") && !strncmp (to_canon, "iso-8859", 8))) + return NULL; + + init_charsets(); + if(!CharsetAliases || !(from = hash_find(CharsetAliases, from_canon))) from = from_canon; if(!CharsetAliases || !(to = hash_find(CharsetAliases, to_canon))) to = to_canon; /* quick check for the identity mapping */ - if((from == to) || ((*from == *to) && !mutt_strcmp(from, to))) + if((from == to) || !mutt_strcmp(from, to)) return NULL; - + snprintf(key, sizeof(key), "%s %s", from, to); if((map = hash_find(Translations, key)) == NULL) { @@ -812,20 +821,25 @@ static char *utf_to_unicode(int *out, char *in) static CHARSET *Unicode = NULL; -void mutt_decode_utf8_string(char *str, CHARSET *chs) +static int unicode_init (void) { - char *s, *t; - CHARDESC *cd; - int ch; - - /* Hack */ - if (!Unicode) { if (load_charset ("ISO_10646", &Unicode, 1) == -1) Unicode = NULL; } + return (Unicode == NULL ? -1 : 0); +} + +void mutt_decode_utf8_string(char *str, CHARSET *chs) +{ + char *s, *t; + CHARDESC *cd; + int ch; + + (void) unicode_init (); + for (s = t = str; *t; s++) { t = utf_to_unicode(&ch, t); @@ -844,128 +858,188 @@ void mutt_decode_utf8_string(char *str, CHARSET *chs) *s = '\0'; } -/* internal use only */ -struct utf8_state -{ - char *buffer; - size_t blen; - size_t bp; -}; -static struct utf8_state *new_utf8_state (void) + +/************************************************************* + * General decoder framework + */ + + + +#define MIN(a,b) (((a) <= (b)) ? 
(a): (b)) + +DECODER *mutt_open_decoder (const char *src, const char *dest) { - return safe_calloc (1, sizeof (struct utf8_state)); + DECODER *d = safe_calloc (1, sizeof (DECODER));; + + d->in.size = DECODER_BUFFSIZE; + d->out.size = DECODER_BUFFSIZE; + + if (!src || !dest || mutt_is_utf8 (dest)) + { + d->just_take_id = 1; + return d; + } + + if (mutt_is_utf8 (src)) + { + if (!(d->chs = mutt_get_charset (dest)) || unicode_init () == -1) + { + d->just_take_id = 1; + return d; + } + + d->src_is_utf8 = 1; + return d; + } + + if (!(d->chm = mutt_get_translation (src, dest))) + d->just_take_id = 1; + + return d; } -static void free_utf8_state (struct utf8_state **sp) +void mutt_free_decoder (DECODER **dpp) { - if (!sp || !*sp) return; - safe_free ((void **) &(*sp)->buffer); - safe_free ((void **) sp); + safe_free ((void **) dpp); } -static void _state_utf8_flush(STATE *s, CHARSET *chs, struct utf8_state *sfu) +static void _process_data (DECODER *, short); + +void mutt_decoder_push (DECODER *d, void *_buff, size_t blen, size_t *taken) { - char *t; - if(!sfu->buffer || !sfu->bp) - return; - - sfu->buffer[sfu->bp] = '\0'; - - mutt_decode_utf8_string(sfu->buffer, chs); - for(t = sfu->buffer; *t; t++) + if (!_buff || !blen) { - /* This may lead to funny-looking output if - * there are embedded CRs, NLs or similar things - * - but these would constitute illegal - * UTF8 encoding anyways, so we don't care. 
- */ + _process_data (d, 1); + return; + } - state_prefix_putc(*t, s); + if ((*taken = MIN(blen, d->in.size - d->in.used))) + { + memcpy (d->in.buff + d->in.used, _buff, *taken); + d->in.used += *taken; } - sfu->bp = 0; } - -static void state_fput_utf8(STATE *st, char u, CHARSET *chs, struct utf8_state *sfu) + + +void mutt_decoder_pop (DECODER *d, void *_buff, size_t blen, size_t *popped) { - if((u & 0x80) == 0 || (sfu->bp && (u & IIOOOOOO) != IOOOOOOO)) - _state_utf8_flush(st, chs, sfu); - - if((u & 0x80) == 0) + unsigned char *buff = _buff; + + _process_data (d, 0); + + if ((*popped = MIN (blen, d->out.used))) { - if(u) state_prefix_putc(u, st); + memcpy (buff, d->out.buff, *popped); + memmove (d->out.buff, d->out.buff + *popped, d->out.used - *popped); + d->out.used -= *popped; } - else +} + +void mutt_decoder_pop_to_state (DECODER *d, STATE *s) +{ + char tmp[DECODER_BUFFSIZE]; + size_t i, l; + + do { - if(sfu->bp + 1 >= sfu->blen) - { - sfu->blen = (sfu->blen + 80) * 2; - safe_realloc((void **) &sfu->buffer, sfu->blen + 1); - } - sfu->buffer[sfu->bp++] = u; + mutt_decoder_pop (d, tmp, sizeof (tmp), &l); + for (i = 0; i < l; i++) + state_prefix_putc (tmp[i], s); } + while (l > 0); } -/* a nicer interface for decoding */ +/* this is where things actually happen */ -DECODER *mutt_open_decoder (STATE *s, BODY *b, int istext) +static void _process_data_8bit (DECODER *d) { - DECODER *dp = safe_calloc (1, sizeof (DECODER)); + size_t i; - dp->s = s; + for (i = 0; i < d->in.used && d->out.used < d->out.size; i++) + d->out.buff[d->out.used++] = mutt_display_char (d->in.buff[i], d->chm); - if (istext && (s->flags & M_CHARCONV)) - { - char *charset = mutt_get_parameter ("charset", b->parameter); - dp->is_utf8 = mutt_is_utf8 (charset) && !mutt_is_utf8 (Charset); - - if (dp->is_utf8) - { - dp->sfu = new_utf8_state (); - dp->chs = mutt_get_charset (Charset); - } - else - dp->map = mutt_get_translation (charset, Charset); - } - - return dp; + memmove (d->in.buff, d->in.buff + 
i, d->in.used - i); + d->in.used -= i; } -void mutt_close_decoder (DECODER **dpp) +static void _process_data_utf8 (DECODER *d) { - if (!dpp || !*dpp) - return; + size_t i, j; + CHARDESC *cd; - if ((*dpp)->is_utf8) + for (i = 0, j = 0; i < d->in.used && d->out.used < d->out.size;) { - _state_utf8_flush ((*dpp)->s, (*dpp)->chs, (*dpp)->sfu); - free_utf8_state (&(*dpp)->sfu); + while (((d->in.buff[j] & 0x80) == 0) && (j < d->in.used) && (d->out.used < d->out.size)) + d->out.buff[d->out.used++] = d->in.buff[j++]; + i = j; + + while ((d->in.buff[j] & 0x80) && j < d->in.used && + (d->forced || j + 6 < d->in.used) && d->out.used < d->out.size) + { + int ch; + char *c = utf_to_unicode (&ch, &d->in.buff[j]); + + j = c - d->in.buff; + + if (0 <= ch && ch < 128) + d->out.buff[d->out.used] = ch; + else if ((cd = repr2descr (ch, Unicode)) && (ch = translate_character (d->chs, cd->symbol)) != -1) + d->out.buff[d->out.used] = ch; + else + d->out.buff[d->out.used] = '?'; + + if(!d->out.buff[d->out.used]) + d->out.buff[d->out.used] = '?'; + + d->out.used++; + } + + i = j; + + if (d->in.buff[j] & 0x80) + break; } - safe_free ((void **) dpp); + memmove (d->in.buff, d->in.buff + i, d->in.used - i); + d->in.used -= i; } -void mutt_decoder_putc (DECODER *dp, char c) +static void _process_data (DECODER *d, short force) { - if (dp->is_utf8) - state_fput_utf8 (dp->s, c, dp->chs, dp->sfu); + if (force) d->forced = 1; + + if (d->just_take_id) + { + size_t l = MIN (d->out.size - d->out.used, d->in.used); + memmove (d->out.buff + d->out.used, d->in.buff, l); + memmove (d->in.buff, d->in.buff + l, d->in.used - l); + d->in.used -= l; + d->out.used += l; + } + else if (d->src_is_utf8) + _process_data_utf8 (d); else - state_prefix_putc (mutt_display_char ((unsigned char) c, dp->map), dp->s); + _process_data_8bit (d); } -/* FIXME: utf-8 support */ +/* This one is currently lacking utf-8 support */ int mutt_recode_file (const char *fname, const char *src, const char *dest) { FILE *fp, *tmpfp; char 
tempfile[_POSIX_PATH_MAX]; + char buffer[1024]; + char tmp[1024]; int c; int rv = -1; - - CHARSET_MAP *map; - if (mutt_is_utf8 (dest) ^ mutt_is_utf8(src)) + size_t lf, lpu, lpo; + char *t; + DECODER *dec; + + if (mutt_is_utf8 (dest) && !mutt_is_utf8 (src)) { mutt_error (_("We can't currently handle utf-8 at this point.")); return -1; @@ -985,11 +1059,34 @@ int mutt_recode_file (const char *fname, const char *src, const char *dest) return -1; } - map = mutt_get_translation (src, dest); + dec = mutt_open_decoder (src, dest); - while ((c = fgetc (fp)) != EOF) - if (fputc (mutt_display_char ((unsigned char) c, map), tmpfp) == EOF) - goto bail; + while ((lf = fread (buffer, 1, sizeof (buffer), fp)) > 0) + { + for (t = buffer; lf; t += lpu) + { + mutt_decoder_push (dec, t, lf, &lpu); + lf -= lpu; + + do + { + mutt_decoder_pop (dec, tmp, sizeof (tmp), &lpo); + if (lpo) + fwrite (tmp, lpo, 1, tmpfp); + } + while (lpo); + } + } + + mutt_decoder_push (dec, NULL, 0, NULL); + do + { + mutt_decoder_pop (dec, tmp, sizeof (tmp), &lpo); + if (lpo) fwrite (tmp, lpo, 1, tmpfp); + } + while (lpo); + + mutt_free_decoder (&dec); fclose (fp); fp = NULL; rewind (tmpfp); @@ -53,31 +53,45 @@ typedef struct } CHARSET; -/* this one could be made a bit smaller with two levels - * of nested unions and structs. It's not worth the effort. 
- */ +#define DECODER_BUFFSIZE 4096 + +struct decoder_buff +{ + size_t size, used; + char buff[DECODER_BUFFSIZE]; +}; typedef struct decoder { - STATE *s; - short is_utf8; - CHARSET_MAP *map; + short src_is_utf8; + short just_take_id; + short forced; + + /* used for utf-8 decoding */ CHARSET *chs; - struct utf8_state *sfu; -} -DECODER; + /* used for 8-bit to 8-bit recoding */ + CHARSET_MAP *chm; + + /* the buffers */ + struct decoder_buff in; + struct decoder_buff out; +} +DECODER; + +DECODER *mutt_open_decoder (const char *, const char *); +void mutt_decoder_push (DECODER *, void *, size_t, size_t *); +void mutt_decoder_pop (DECODER *, void *, size_t, size_t *); +void mutt_decoder_pop_to_state (DECODER *, STATE *); +void mutt_free_decoder (DECODER **); CHARSET *mutt_get_charset(const char *); CHARSET_MAP *mutt_get_translation(const char *, const char *); -DECODER *mutt_open_decoder (STATE *, BODY *, int); int mutt_display_string(char *, CHARSET_MAP *); int mutt_is_utf8(const char *); int mutt_recode_file (const char *, const char *, const char *); unsigned char mutt_display_char(unsigned char, CHARSET_MAP *); -void mutt_close_decoder (DECODER **); void mutt_decode_utf8_string(char *, CHARSET *); -void mutt_decoder_putc (DECODER *, char); #endif @@ -64,17 +64,17 @@ int Index_64[128] = { 41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1 }; -void mutt_decode_xbit (STATE *s, BODY *b, int istext) +void mutt_decode_xbit (STATE *s, BODY *b, int istext, DECODER *dec) { long len = b->length; int c, ch; - + char cc; + size_t l; + if (istext) { - DECODER *dec = mutt_open_decoder (s, b, istext); - state_set_prefix(s); - + while ((c = fgetc(s->fpin)) != EOF && len--) { if(c == '\r' && len) @@ -87,10 +87,16 @@ void mutt_decode_xbit (STATE *s, BODY *b, int istext) else ungetc(ch, s->fpin); } - - mutt_decoder_putc (dec, c); + + cc = c; + mutt_decoder_push (dec, &cc, 1, &l); + mutt_decoder_pop_to_state (dec, s); + } - mutt_close_decoder (&dec); + + mutt_decoder_push (dec, NULL, 
0, NULL); + mutt_decoder_pop_to_state (dec, s); + state_reset_prefix (s); } else @@ -109,14 +115,15 @@ static int handler_state_fgetc(STATE *s) return ch; } -void mutt_decode_quoted (STATE *s, BODY *b, int istext) +void mutt_decode_quoted (STATE *s, BODY *b, int istext, DECODER *dec) { long len = b->length; int ch; - DECODER *dec = mutt_open_decoder (s, b, istext); - + char cc; + size_t l; + state_set_prefix(s); - + while (len > 0) { if ((ch = handler_state_fgetc(s)) == EOF) @@ -171,20 +178,27 @@ void mutt_decode_quoted (STATE *s, BODY *b, int istext) } if(ch != EOF) - mutt_decoder_putc (dec, ch); + { + cc = ch; + mutt_decoder_push (dec, &cc, 1, &l); + mutt_decoder_pop_to_state (dec, s); + } } - mutt_close_decoder (&dec); + mutt_decoder_push (dec, NULL, 0, NULL); + mutt_decoder_pop_to_state (dec, s); + state_reset_prefix(s); } -void mutt_decode_base64 (STATE *s, BODY *b, int istext) +void mutt_decode_base64 (STATE *s, BODY *b, int istext, DECODER *dec) { long len = b->length; char buf[5]; int c1, c2, c3, c4, ch, cr = 0, i; - DECODER *dec = mutt_open_decoder (s, b, istext); - + char cc; + size_t l; + buf[4] = 0; if (istext) state_set_prefix(s); @@ -206,13 +220,19 @@ void mutt_decode_base64 (STATE *s, BODY *b, int istext) ch = (c1 << 2) | (c2 >> 4); if (cr && ch != '\n') - mutt_decoder_putc (dec, '\r'); + { + cc = '\r'; + mutt_decoder_push (dec, &cc, 1, &l); + } cr = 0; if (istext && ch == '\r') cr = 1; else - mutt_decoder_putc (dec, ch); + { + cc = ch; + mutt_decoder_push (dec, &cc, 1, &l); + } if (buf[2] == '=') break; @@ -220,29 +240,45 @@ void mutt_decode_base64 (STATE *s, BODY *b, int istext) ch = ((c2 & 0xf) << 4) | (c3 >> 2); if (cr && ch != '\n') - mutt_decoder_putc (dec, ch); + { + cc = ch; + mutt_decoder_push (dec, &cc, 1, &l); + } cr = 0; if (istext && ch == '\r') cr = 1; else - mutt_decoder_putc (dec, ch); + { + cc = ch; + mutt_decoder_push (dec, &cc, 1, &l); + } if (buf[3] == '=') break; c4 = base64val (buf[3]); ch = ((c3 & 0x3) << 6) | c4; if (cr && ch 
!= '\n') - mutt_decoder_putc (dec, ch); + { + cc = ch; + mutt_decoder_push (dec, &cc, 1, &l); + } cr = 0; if (istext && ch == '\r') cr = 1; else - mutt_decoder_putc (dec, ch); + { + cc = ch; + mutt_decoder_push (dec, &cc, 1, &l); + } + + mutt_decoder_pop_to_state (dec, s); } - mutt_close_decoder (&dec); + mutt_decoder_push (dec, NULL, 0, NULL); + mutt_decoder_pop_to_state (dec, s); + state_reset_prefix(s); } @@ -253,13 +289,13 @@ unsigned char decode_byte (char ch) return ch - 32; } -void mutt_decode_uuencoded (STATE *s, BODY *b, int istext) +void mutt_decode_uuencoded (STATE *s, BODY *b, int istext, DECODER *dec) { char tmps[SHORT_STRING]; char linelen, c, l, out; char *pt; long len = b->length; - DECODER *dec = mutt_open_decoder (s, b, istext); + size_t dummy; if(istext) state_set_prefix(s); @@ -289,16 +325,19 @@ void mutt_decode_uuencoded (STATE *s, BODY *b, int istext) out = decode_byte (*pt) << l; pt++; out |= (decode_byte (*pt) >> (6 - l)); - mutt_decoder_putc (dec, out); + mutt_decoder_push (dec, &out, 1, &dummy); c++; if (c == linelen) break; } + mutt_decoder_pop_to_state (dec, s); pt++; } } - mutt_close_decoder (&dec); + mutt_decoder_push (dec, NULL, 0, NULL); + mutt_decoder_pop_to_state (dec, s); + state_reset_prefix(s); } @@ -1238,22 +1277,33 @@ static void external_body_handler (BODY *b, STATE *s) void mutt_decode_attachment (BODY *b, STATE *s) { + char *charset = mutt_get_parameter ("charset", b->parameter); + int istext = mutt_is_text_type (b->type, b->subtype); + DECODER *dec; + + if (istext && s->flags & M_CHARCONV) + dec = mutt_open_decoder (charset, Charset); + else + dec = mutt_open_decoder (NULL, NULL); + fseek (s->fpin, b->offset, 0); switch (b->encoding) { case ENCQUOTEDPRINTABLE: - mutt_decode_quoted (s, b, mutt_is_text_type (b->type, b->subtype)); + mutt_decode_quoted (s, b, istext, dec); break; case ENCBASE64: - mutt_decode_base64 (s, b, mutt_is_text_type (b->type, b->subtype)); + mutt_decode_base64 (s, b, istext, dec); break; case 
ENCUUENCODED: - mutt_decode_uuencoded (s, b, mutt_is_text_type (b->type, b->subtype)); + mutt_decode_uuencoded (s, b, istext, dec); break; default: - mutt_decode_xbit (s, b, mutt_is_text_type (b->type, b->subtype)); + mutt_decode_xbit (s, b, istext, dec); break; } + + mutt_free_decoder (&dec); } void mutt_body_handler (BODY *b, STATE *s) |