summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Thomas Roessler <roessler@does-not-exist.org> 1999-03-30 23:50:33 +0000
committer Thomas Roessler <roessler@does-not-exist.org> 1999-03-30 23:50:33 +0000
commit ec8c796bd17158c54f2415b1e71a82d309687126 (patch)
tree 472eb9f8f428af3c73f39c6e63ea1cebad36facd
parent a784b853587106e064e2c72d2058b88223434a6a (diff)
This patch removes at least some of the horrible utf-8 kluges in
charset.c. The new DECODER framework is currently only used in handler.c, and there in a horribly inefficient manner. We should use larger blocks of data, which would be much more efficient than what we are currently doing. Most of the other charset-related code still uses the old mutt_display_char() & friends interface, which is actually ok as long as you don't try to handle multibyte character sets. The most notable change should be the one to mutt_get_translation(): it will delay the loading and parsing of character set information files until it's really needed, catching a large number of standard cases. As a side effect, this will make "iso tagged as ascii" "work" again, as long as both sides use the same iso character set.
-rw-r--r-- charset.c 281
-rw-r--r-- charset.h 38
-rw-r--r-- handler.c 114
3 files changed, 297 insertions, 136 deletions
diff --git a/charset.c b/charset.c
index e4fa80a8..b2ff695c 100644
--- a/charset.c
+++ b/charset.c
@@ -682,21 +682,30 @@ CHARSET_MAP *mutt_get_translation(const char *_from, const char *_to)
if(!_from || !_to)
return NULL;
-
- init_charsets();
canonical_charset(from_canon, sizeof(from_canon), _from);
canonical_charset(to_canon, sizeof(to_canon), _to);
+ /* quick check for some trivial cases. Doing this before
+ * we actually call the initialization routine delays character
+ * set loading until it's _really_ needed.
+ */
+
+ if(!strcmp(from_canon, to_canon)
+ || (!strcmp (from_canon, "us-ascii") && !strncmp (to_canon, "iso-8859", 8)))
+ return NULL;
+
+ init_charsets();
+
if(!CharsetAliases || !(from = hash_find(CharsetAliases, from_canon)))
from = from_canon;
if(!CharsetAliases || !(to = hash_find(CharsetAliases, to_canon)))
to = to_canon;
/* quick check for the identity mapping */
- if((from == to) || ((*from == *to) && !mutt_strcmp(from, to)))
+ if((from == to) || !mutt_strcmp(from, to))
return NULL;
-
+
snprintf(key, sizeof(key), "%s %s", from, to);
if((map = hash_find(Translations, key)) == NULL)
{
@@ -812,20 +821,25 @@ static char *utf_to_unicode(int *out, char *in)
static CHARSET *Unicode = NULL;
-void mutt_decode_utf8_string(char *str, CHARSET *chs)
+static int unicode_init (void)
{
- char *s, *t;
- CHARDESC *cd;
- int ch;
-
- /* Hack */
-
if (!Unicode)
{
if (load_charset ("ISO_10646", &Unicode, 1) == -1)
Unicode = NULL;
}
+ return (Unicode == NULL ? -1 : 0);
+}
+
+void mutt_decode_utf8_string(char *str, CHARSET *chs)
+{
+ char *s, *t;
+ CHARDESC *cd;
+ int ch;
+
+ (void) unicode_init ();
+
for (s = t = str; *t; s++)
{
t = utf_to_unicode(&ch, t);
@@ -844,128 +858,188 @@ void mutt_decode_utf8_string(char *str, CHARSET *chs)
*s = '\0';
}
-/* internal use only */
-struct utf8_state
-{
- char *buffer;
- size_t blen;
- size_t bp;
-};
-static struct utf8_state *new_utf8_state (void)
+
+/*************************************************************
+ * General decoder framework
+ */
+
+
+
+#define MIN(a,b) (((a) <= (b)) ? (a): (b))
+
+DECODER *mutt_open_decoder (const char *src, const char *dest)
{
- return safe_calloc (1, sizeof (struct utf8_state));
+ DECODER *d = safe_calloc (1, sizeof (DECODER));;
+
+ d->in.size = DECODER_BUFFSIZE;
+ d->out.size = DECODER_BUFFSIZE;
+
+ if (!src || !dest || mutt_is_utf8 (dest))
+ {
+ d->just_take_id = 1;
+ return d;
+ }
+
+ if (mutt_is_utf8 (src))
+ {
+ if (!(d->chs = mutt_get_charset (dest)) || unicode_init () == -1)
+ {
+ d->just_take_id = 1;
+ return d;
+ }
+
+ d->src_is_utf8 = 1;
+ return d;
+ }
+
+ if (!(d->chm = mutt_get_translation (src, dest)))
+ d->just_take_id = 1;
+
+ return d;
}
-static void free_utf8_state (struct utf8_state **sp)
+void mutt_free_decoder (DECODER **dpp)
{
- if (!sp || !*sp) return;
- safe_free ((void **) &(*sp)->buffer);
- safe_free ((void **) sp);
+ safe_free ((void **) dpp);
}
-static void _state_utf8_flush(STATE *s, CHARSET *chs, struct utf8_state *sfu)
+static void _process_data (DECODER *, short);
+
+void mutt_decoder_push (DECODER *d, void *_buff, size_t blen, size_t *taken)
{
- char *t;
- if(!sfu->buffer || !sfu->bp)
- return;
-
- sfu->buffer[sfu->bp] = '\0';
-
- mutt_decode_utf8_string(sfu->buffer, chs);
- for(t = sfu->buffer; *t; t++)
+ if (!_buff || !blen)
{
- /* This may lead to funny-looking output if
- * there are embedded CRs, NLs or similar things
- * - but these would constitute illegal
- * UTF8 encoding anyways, so we don't care.
- */
+ _process_data (d, 1);
+ return;
+ }
- state_prefix_putc(*t, s);
+ if ((*taken = MIN(blen, d->in.size - d->in.used)))
+ {
+ memcpy (d->in.buff + d->in.used, _buff, *taken);
+ d->in.used += *taken;
}
- sfu->bp = 0;
}
-
-static void state_fput_utf8(STATE *st, char u, CHARSET *chs, struct utf8_state *sfu)
+
+
+void mutt_decoder_pop (DECODER *d, void *_buff, size_t blen, size_t *popped)
{
- if((u & 0x80) == 0 || (sfu->bp && (u & IIOOOOOO) != IOOOOOOO))
- _state_utf8_flush(st, chs, sfu);
-
- if((u & 0x80) == 0)
+ unsigned char *buff = _buff;
+
+ _process_data (d, 0);
+
+ if ((*popped = MIN (blen, d->out.used)))
{
- if(u) state_prefix_putc(u, st);
+ memcpy (buff, d->out.buff, *popped);
+ memmove (d->out.buff, d->out.buff + *popped, d->out.used - *popped);
+ d->out.used -= *popped;
}
- else
+}
+
+void mutt_decoder_pop_to_state (DECODER *d, STATE *s)
+{
+ char tmp[DECODER_BUFFSIZE];
+ size_t i, l;
+
+ do
{
- if(sfu->bp + 1 >= sfu->blen)
- {
- sfu->blen = (sfu->blen + 80) * 2;
- safe_realloc((void **) &sfu->buffer, sfu->blen + 1);
- }
- sfu->buffer[sfu->bp++] = u;
+ mutt_decoder_pop (d, tmp, sizeof (tmp), &l);
+ for (i = 0; i < l; i++)
+ state_prefix_putc (tmp[i], s);
}
+ while (l > 0);
}
-/* a nicer interface for decoding */
+/* this is where things actually happen */
-DECODER *mutt_open_decoder (STATE *s, BODY *b, int istext)
+static void _process_data_8bit (DECODER *d)
{
- DECODER *dp = safe_calloc (1, sizeof (DECODER));
+ size_t i;
- dp->s = s;
+ for (i = 0; i < d->in.used && d->out.used < d->out.size; i++)
+ d->out.buff[d->out.used++] = mutt_display_char (d->in.buff[i], d->chm);
- if (istext && (s->flags & M_CHARCONV))
- {
- char *charset = mutt_get_parameter ("charset", b->parameter);
- dp->is_utf8 = mutt_is_utf8 (charset) && !mutt_is_utf8 (Charset);
-
- if (dp->is_utf8)
- {
- dp->sfu = new_utf8_state ();
- dp->chs = mutt_get_charset (Charset);
- }
- else
- dp->map = mutt_get_translation (charset, Charset);
- }
-
- return dp;
+ memmove (d->in.buff, d->in.buff + i, d->in.used - i);
+ d->in.used -= i;
}
-void mutt_close_decoder (DECODER **dpp)
+static void _process_data_utf8 (DECODER *d)
{
- if (!dpp || !*dpp)
- return;
+ size_t i, j;
+ CHARDESC *cd;
- if ((*dpp)->is_utf8)
+ for (i = 0, j = 0; i < d->in.used && d->out.used < d->out.size;)
{
- _state_utf8_flush ((*dpp)->s, (*dpp)->chs, (*dpp)->sfu);
- free_utf8_state (&(*dpp)->sfu);
+ while (((d->in.buff[j] & 0x80) == 0) && (j < d->in.used) && (d->out.used < d->out.size))
+ d->out.buff[d->out.used++] = d->in.buff[j++];
+ i = j;
+
+ while ((d->in.buff[j] & 0x80) && j < d->in.used &&
+ (d->forced || j + 6 < d->in.used) && d->out.used < d->out.size)
+ {
+ int ch;
+ char *c = utf_to_unicode (&ch, &d->in.buff[j]);
+
+ j = c - d->in.buff;
+
+ if (0 <= ch && ch < 128)
+ d->out.buff[d->out.used] = ch;
+ else if ((cd = repr2descr (ch, Unicode)) && (ch = translate_character (d->chs, cd->symbol)) != -1)
+ d->out.buff[d->out.used] = ch;
+ else
+ d->out.buff[d->out.used] = '?';
+
+ if(!d->out.buff[d->out.used])
+ d->out.buff[d->out.used] = '?';
+
+ d->out.used++;
+ }
+
+ i = j;
+
+ if (d->in.buff[j] & 0x80)
+ break;
}
- safe_free ((void **) dpp);
+ memmove (d->in.buff, d->in.buff + i, d->in.used - i);
+ d->in.used -= i;
}
-void mutt_decoder_putc (DECODER *dp, char c)
+static void _process_data (DECODER *d, short force)
{
- if (dp->is_utf8)
- state_fput_utf8 (dp->s, c, dp->chs, dp->sfu);
+ if (force) d->forced = 1;
+
+ if (d->just_take_id)
+ {
+ size_t l = MIN (d->out.size - d->out.used, d->in.used);
+ memmove (d->out.buff + d->out.used, d->in.buff, l);
+ memmove (d->in.buff, d->in.buff + l, d->in.used - l);
+ d->in.used -= l;
+ d->out.used += l;
+ }
+ else if (d->src_is_utf8)
+ _process_data_utf8 (d);
else
- state_prefix_putc (mutt_display_char ((unsigned char) c, dp->map), dp->s);
+ _process_data_8bit (d);
}
-/* FIXME: utf-8 support */
+/* This one is currently lacking utf-8 support */
int mutt_recode_file (const char *fname, const char *src, const char *dest)
{
FILE *fp, *tmpfp;
char tempfile[_POSIX_PATH_MAX];
+ char buffer[1024];
+ char tmp[1024];
int c;
int rv = -1;
-
- CHARSET_MAP *map;
- if (mutt_is_utf8 (dest) ^ mutt_is_utf8(src))
+ size_t lf, lpu, lpo;
+ char *t;
+ DECODER *dec;
+
+ if (mutt_is_utf8 (dest) && !mutt_is_utf8 (src))
{
mutt_error (_("We can't currently handle utf-8 at this point."));
return -1;
@@ -985,11 +1059,34 @@ int mutt_recode_file (const char *fname, const char *src, const char *dest)
return -1;
}
- map = mutt_get_translation (src, dest);
+ dec = mutt_open_decoder (src, dest);
- while ((c = fgetc (fp)) != EOF)
- if (fputc (mutt_display_char ((unsigned char) c, map), tmpfp) == EOF)
- goto bail;
+ while ((lf = fread (buffer, 1, sizeof (buffer), fp)) > 0)
+ {
+ for (t = buffer; lf; t += lpu)
+ {
+ mutt_decoder_push (dec, t, lf, &lpu);
+ lf -= lpu;
+
+ do
+ {
+ mutt_decoder_pop (dec, tmp, sizeof (tmp), &lpo);
+ if (lpo)
+ fwrite (tmp, lpo, 1, tmpfp);
+ }
+ while (lpo);
+ }
+ }
+
+ mutt_decoder_push (dec, NULL, 0, NULL);
+ do
+ {
+ mutt_decoder_pop (dec, tmp, sizeof (tmp), &lpo);
+ if (lpo) fwrite (tmp, lpo, 1, tmpfp);
+ }
+ while (lpo);
+
+ mutt_free_decoder (&dec);
fclose (fp); fp = NULL;
rewind (tmpfp);
diff --git a/charset.h b/charset.h
index 39522428..8f466ef5 100644
--- a/charset.h
+++ b/charset.h
@@ -53,31 +53,45 @@ typedef struct
}
CHARSET;
-/* this one could be made a bit smaller with two levels
- * of nested unions and structs. It's not worth the effort.
- */
+#define DECODER_BUFFSIZE 4096
+
+struct decoder_buff
+{
+ size_t size, used;
+ char buff[DECODER_BUFFSIZE];
+};
typedef struct decoder
{
- STATE *s;
- short is_utf8;
- CHARSET_MAP *map;
+ short src_is_utf8;
+ short just_take_id;
+ short forced;
+
+ /* used for utf-8 decoding */
CHARSET *chs;
- struct utf8_state *sfu;
-}
-DECODER;
+ /* used for 8-bit to 8-bit recoding */
+ CHARSET_MAP *chm;
+
+ /* the buffers */
+ struct decoder_buff in;
+ struct decoder_buff out;
+}
+DECODER;
+
+DECODER *mutt_open_decoder (const char *, const char *);
+void mutt_decoder_push (DECODER *, void *, size_t, size_t *);
+void mutt_decoder_pop (DECODER *, void *, size_t, size_t *);
+void mutt_decoder_pop_to_state (DECODER *, STATE *);
+void mutt_free_decoder (DECODER **);
CHARSET *mutt_get_charset(const char *);
CHARSET_MAP *mutt_get_translation(const char *, const char *);
-DECODER *mutt_open_decoder (STATE *, BODY *, int);
int mutt_display_string(char *, CHARSET_MAP *);
int mutt_is_utf8(const char *);
int mutt_recode_file (const char *, const char *, const char *);
unsigned char mutt_display_char(unsigned char, CHARSET_MAP *);
-void mutt_close_decoder (DECODER **);
void mutt_decode_utf8_string(char *, CHARSET *);
-void mutt_decoder_putc (DECODER *, char);
#endif
diff --git a/handler.c b/handler.c
index 0c5b4e43..66c15df9 100644
--- a/handler.c
+++ b/handler.c
@@ -64,17 +64,17 @@ int Index_64[128] = {
41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
};
-void mutt_decode_xbit (STATE *s, BODY *b, int istext)
+void mutt_decode_xbit (STATE *s, BODY *b, int istext, DECODER *dec)
{
long len = b->length;
int c, ch;
-
+ char cc;
+ size_t l;
+
if (istext)
{
- DECODER *dec = mutt_open_decoder (s, b, istext);
-
state_set_prefix(s);
-
+
while ((c = fgetc(s->fpin)) != EOF && len--)
{
if(c == '\r' && len)
@@ -87,10 +87,16 @@ void mutt_decode_xbit (STATE *s, BODY *b, int istext)
else
ungetc(ch, s->fpin);
}
-
- mutt_decoder_putc (dec, c);
+
+ cc = c;
+ mutt_decoder_push (dec, &cc, 1, &l);
+ mutt_decoder_pop_to_state (dec, s);
+
}
- mutt_close_decoder (&dec);
+
+ mutt_decoder_push (dec, NULL, 0, NULL);
+ mutt_decoder_pop_to_state (dec, s);
+
state_reset_prefix (s);
}
else
@@ -109,14 +115,15 @@ static int handler_state_fgetc(STATE *s)
return ch;
}
-void mutt_decode_quoted (STATE *s, BODY *b, int istext)
+void mutt_decode_quoted (STATE *s, BODY *b, int istext, DECODER *dec)
{
long len = b->length;
int ch;
- DECODER *dec = mutt_open_decoder (s, b, istext);
-
+ char cc;
+ size_t l;
+
state_set_prefix(s);
-
+
while (len > 0)
{
if ((ch = handler_state_fgetc(s)) == EOF)
@@ -171,20 +178,27 @@ void mutt_decode_quoted (STATE *s, BODY *b, int istext)
}
if(ch != EOF)
- mutt_decoder_putc (dec, ch);
+ {
+ cc = ch;
+ mutt_decoder_push (dec, &cc, 1, &l);
+ mutt_decoder_pop_to_state (dec, s);
+ }
}
- mutt_close_decoder (&dec);
+ mutt_decoder_push (dec, NULL, 0, NULL);
+ mutt_decoder_pop_to_state (dec, s);
+
state_reset_prefix(s);
}
-void mutt_decode_base64 (STATE *s, BODY *b, int istext)
+void mutt_decode_base64 (STATE *s, BODY *b, int istext, DECODER *dec)
{
long len = b->length;
char buf[5];
int c1, c2, c3, c4, ch, cr = 0, i;
- DECODER *dec = mutt_open_decoder (s, b, istext);
-
+ char cc;
+ size_t l;
+
buf[4] = 0;
if (istext) state_set_prefix(s);
@@ -206,13 +220,19 @@ void mutt_decode_base64 (STATE *s, BODY *b, int istext)
ch = (c1 << 2) | (c2 >> 4);
if (cr && ch != '\n')
- mutt_decoder_putc (dec, '\r');
+ {
+ cc = '\r';
+ mutt_decoder_push (dec, &cc, 1, &l);
+ }
cr = 0;
if (istext && ch == '\r')
cr = 1;
else
- mutt_decoder_putc (dec, ch);
+ {
+ cc = ch;
+ mutt_decoder_push (dec, &cc, 1, &l);
+ }
if (buf[2] == '=')
break;
@@ -220,29 +240,45 @@ void mutt_decode_base64 (STATE *s, BODY *b, int istext)
ch = ((c2 & 0xf) << 4) | (c3 >> 2);
if (cr && ch != '\n')
- mutt_decoder_putc (dec, ch);
+ {
+ cc = ch;
+ mutt_decoder_push (dec, &cc, 1, &l);
+ }
cr = 0;
if (istext && ch == '\r')
cr = 1;
else
- mutt_decoder_putc (dec, ch);
+ {
+ cc = ch;
+ mutt_decoder_push (dec, &cc, 1, &l);
+ }
if (buf[3] == '=') break;
c4 = base64val (buf[3]);
ch = ((c3 & 0x3) << 6) | c4;
if (cr && ch != '\n')
- mutt_decoder_putc (dec, ch);
+ {
+ cc = ch;
+ mutt_decoder_push (dec, &cc, 1, &l);
+ }
cr = 0;
if (istext && ch == '\r')
cr = 1;
else
- mutt_decoder_putc (dec, ch);
+ {
+ cc = ch;
+ mutt_decoder_push (dec, &cc, 1, &l);
+ }
+
+ mutt_decoder_pop_to_state (dec, s);
}
- mutt_close_decoder (&dec);
+ mutt_decoder_push (dec, NULL, 0, NULL);
+ mutt_decoder_pop_to_state (dec, s);
+
state_reset_prefix(s);
}
@@ -253,13 +289,13 @@ unsigned char decode_byte (char ch)
return ch - 32;
}
-void mutt_decode_uuencoded (STATE *s, BODY *b, int istext)
+void mutt_decode_uuencoded (STATE *s, BODY *b, int istext, DECODER *dec)
{
char tmps[SHORT_STRING];
char linelen, c, l, out;
char *pt;
long len = b->length;
- DECODER *dec = mutt_open_decoder (s, b, istext);
+ size_t dummy;
if(istext)
state_set_prefix(s);
@@ -289,16 +325,19 @@ void mutt_decode_uuencoded (STATE *s, BODY *b, int istext)
out = decode_byte (*pt) << l;
pt++;
out |= (decode_byte (*pt) >> (6 - l));
- mutt_decoder_putc (dec, out);
+ mutt_decoder_push (dec, &out, 1, &dummy);
c++;
if (c == linelen)
break;
}
+ mutt_decoder_pop_to_state (dec, s);
pt++;
}
}
- mutt_close_decoder (&dec);
+ mutt_decoder_push (dec, NULL, 0, NULL);
+ mutt_decoder_pop_to_state (dec, s);
+
state_reset_prefix(s);
}
@@ -1238,22 +1277,33 @@ static void external_body_handler (BODY *b, STATE *s)
void mutt_decode_attachment (BODY *b, STATE *s)
{
+ char *charset = mutt_get_parameter ("charset", b->parameter);
+ int istext = mutt_is_text_type (b->type, b->subtype);
+ DECODER *dec;
+
+ if (istext && s->flags & M_CHARCONV)
+ dec = mutt_open_decoder (charset, Charset);
+ else
+ dec = mutt_open_decoder (NULL, NULL);
+
fseek (s->fpin, b->offset, 0);
switch (b->encoding)
{
case ENCQUOTEDPRINTABLE:
- mutt_decode_quoted (s, b, mutt_is_text_type (b->type, b->subtype));
+ mutt_decode_quoted (s, b, istext, dec);
break;
case ENCBASE64:
- mutt_decode_base64 (s, b, mutt_is_text_type (b->type, b->subtype));
+ mutt_decode_base64 (s, b, istext, dec);
break;
case ENCUUENCODED:
- mutt_decode_uuencoded (s, b, mutt_is_text_type (b->type, b->subtype));
+ mutt_decode_uuencoded (s, b, istext, dec);
break;
default:
- mutt_decode_xbit (s, b, mutt_is_text_type (b->type, b->subtype));
+ mutt_decode_xbit (s, b, istext, dec);
break;
}
+
+ mutt_free_decoder (&dec);
}
void mutt_body_handler (BODY *b, STATE *s)