This patch removes at least some of the horrible utf-8 kluges in

charset.c. The new DECODER framework is currently only used in handler.c, and there in a horribly inefficient manner. We should use larger blocks of data, which would be much more efficient than what we are currently doing. Most of the other charset-related code still uses the old mutt_display_char() & friends interface, which is actually OK as long as you don't try to handle multibyte character sets. The most notable change should be the one to mutt_get_translation(): it will delay the loading and parsing of character set information files until they are really needed, catching a large number of standard cases before any loading takes place. As a side effect, this will make "ISO tagged as ASCII" "work" again, as long as both sides use the same ISO character set.
author: Thomas Roessler <roessler@does-not-exist.org> 1999-03-30 23:50:33 +0000
committer: Thomas Roessler <roessler@does-not-exist.org> 1999-03-30 23:50:33 +0000
commit: ec8c796bd17158c54f2415b1e71a82d309687126 (patch)
tree: 472eb9f8f428af3c73f39c6e63ea1cebad36facd
parent: a784b853587106e064e2c72d2058b88223434a6a (diff)
3 files changed, 297 insertions, 136 deletions
diff --git a/charset.c b/charset.c
index e4fa80a8..b2ff695c 100644
--- a/charset.c
+++ b/charset.c
@@ -682,21 +682,30 @@ CHARSET_MAP *mutt_get_translation(const char *_from, const char *_to)
 
   if(!_from || !_to)
     return NULL;
-  
-  init_charsets();
 
   canonical_charset(from_canon, sizeof(from_canon), _from);
   canonical_charset(to_canon, sizeof(to_canon), _to);
 
+  /* quick check for some trivial cases.  Doing this before
+   * we actually call the initialization routine delays character
+   * set loading until it's _really_ needed.
+   */
+
+  if(!strcmp(from_canon, to_canon)
+     || (!strcmp (from_canon, "us-ascii") && !strncmp (to_canon, "iso-8859", 8)))
+    return NULL;
+
+  init_charsets();
+
   if(!CharsetAliases || !(from = hash_find(CharsetAliases, from_canon)))
     from = from_canon;
   if(!CharsetAliases || !(to = hash_find(CharsetAliases, to_canon)))
     to = to_canon;
   
   /* quick check for the identity mapping */
-  if((from == to) || ((*from == *to) && !mutt_strcmp(from, to)))
+  if((from == to) || !mutt_strcmp(from, to))
     return NULL;
-  
+
   snprintf(key, sizeof(key), "%s %s", from, to);
   if((map = hash_find(Translations, key)) == NULL)
   {
@@ -812,20 +821,25 @@ static char *utf_to_unicode(int *out, char *in)
 
 static CHARSET *Unicode = NULL;
 
-void mutt_decode_utf8_string(char *str, CHARSET *chs)
+static int unicode_init (void)
 {
-  char *s, *t;
-  CHARDESC *cd;
-  int ch;
-
-  /* Hack */
-  
   if (!Unicode)
   {
     if (load_charset ("ISO_10646", &Unicode, 1) == -1)
       Unicode = NULL;
   }
   
+  return (Unicode == NULL ? -1 : 0);
+}
+
+void mutt_decode_utf8_string(char *str, CHARSET *chs)
+{
+  char *s, *t;
+  CHARDESC *cd;
+  int ch;
+
+  (void) unicode_init ();
+  
   for (s = t = str; *t; s++)
   {
     t = utf_to_unicode(&ch, t);
@@ -844,128 +858,188 @@ void mutt_decode_utf8_string(char *str, CHARSET *chs)
   *s = '\0';
 }
 
-/* internal use only */
 
-struct utf8_state
-{
-  char *buffer;
-  size_t blen;
-  size_t bp;
-};
 
-static struct utf8_state *new_utf8_state (void)
+
+/*************************************************************
+ * General decoder framework
+ */
+
+
+
+#define MIN(a,b) (((a) <= (b)) ? (a): (b))
+
+DECODER *mutt_open_decoder (const char *src, const char *dest)
 {
-  return safe_calloc (1, sizeof (struct utf8_state));
+  DECODER *d = safe_calloc (1, sizeof (DECODER));;
+
+  d->in.size = DECODER_BUFFSIZE;
+  d->out.size = DECODER_BUFFSIZE;
+
+  if (!src || !dest || mutt_is_utf8 (dest))
+  {
+    d->just_take_id = 1;
+    return d;
+  }
+  
+  if (mutt_is_utf8 (src))
+  {
+    if (!(d->chs = mutt_get_charset (dest)) || unicode_init () == -1)
+    {
+      d->just_take_id = 1;
+      return d;
+    }
+    
+    d->src_is_utf8 = 1;
+    return d;
+  }
+  
+  if (!(d->chm = mutt_get_translation (src, dest)))
+    d->just_take_id = 1;
+  
+  return d;
 }
 
-static void free_utf8_state (struct utf8_state **sp)
+void mutt_free_decoder (DECODER **dpp)
 {
-  if (!sp || !*sp) return;
-  safe_free ((void **) &(*sp)->buffer);
-  safe_free ((void **) sp);
+  safe_free ((void **) dpp);
 }
 
-static void _state_utf8_flush(STATE *s, CHARSET *chs, struct utf8_state *sfu)
+static void _process_data (DECODER *, short);
+
+void mutt_decoder_push (DECODER *d, void *_buff, size_t blen, size_t *taken)
 {
-  char *t;
-  if(!sfu->buffer || !sfu->bp)
-    return;
-  
-  sfu->buffer[sfu->bp] = '\0';
-  
-  mutt_decode_utf8_string(sfu->buffer, chs);
-  for(t = sfu->buffer; *t; t++)
+  if (!_buff || !blen)
   {
-    /* This may lead to funny-looking output if 
-     * there are embedded CRs, NLs or similar things
-     * - but these would constitute illegal 
-     * UTF8 encoding anyways, so we don't care.
-     */
+    _process_data (d, 1);
+    return;
+  }
 
-    state_prefix_putc(*t, s);
+  if ((*taken = MIN(blen, d->in.size - d->in.used)))
+  {
+    memcpy (d->in.buff + d->in.used, _buff, *taken);
+    d->in.used += *taken;
   }
-  sfu->bp = 0;
 }
-    
-static void state_fput_utf8(STATE *st, char u, CHARSET *chs, struct utf8_state *sfu)
+
+
+void mutt_decoder_pop (DECODER *d, void *_buff, size_t blen, size_t *popped)
 {
-  if((u & 0x80) == 0 || (sfu->bp && (u & IIOOOOOO) != IOOOOOOO))
-    _state_utf8_flush(st, chs, sfu);
-     
-  if((u & 0x80) == 0)
+  unsigned char *buff = _buff;
+
+  _process_data (d, 0);
+  
+  if ((*popped = MIN (blen, d->out.used)))
   {
-    if(u) state_prefix_putc(u, st);
+    memcpy (buff, d->out.buff, *popped);
+    memmove (d->out.buff, d->out.buff + *popped, d->out.used - *popped);
+    d->out.used -= *popped;
   }
-  else
+}
+
+void mutt_decoder_pop_to_state (DECODER *d, STATE *s)
+{
+  char tmp[DECODER_BUFFSIZE];
+  size_t i, l;
+  
+  do 
   {
-    if(sfu->bp + 1 >= sfu->blen)
-    {
-      sfu->blen = (sfu->blen + 80) * 2;
-      safe_realloc((void **) &sfu->buffer, sfu->blen + 1);
-    }
-    sfu->buffer[sfu->bp++] = u;
+    mutt_decoder_pop (d, tmp, sizeof (tmp), &l);
+    for (i = 0; i < l; i++)
+      state_prefix_putc (tmp[i], s);
   }
+  while (l > 0);
 }
 
-/* a nicer interface for decoding */
+/* this is where things actually happen */
 
-DECODER *mutt_open_decoder (STATE *s, BODY *b, int istext)
+static void _process_data_8bit (DECODER *d)
 {
-  DECODER *dp = safe_calloc (1, sizeof (DECODER));
+  size_t i;
   
-  dp->s = s;
+  for (i = 0; i < d->in.used && d->out.used < d->out.size; i++)
+    d->out.buff[d->out.used++] = mutt_display_char (d->in.buff[i], d->chm);
   
-  if (istext && (s->flags & M_CHARCONV))
-  {
-    char *charset = mutt_get_parameter ("charset", b->parameter);
-    dp->is_utf8 = mutt_is_utf8 (charset) && !mutt_is_utf8 (Charset);
-    
-    if (dp->is_utf8)
-    {
-      dp->sfu = new_utf8_state ();
-      dp->chs = mutt_get_charset (Charset);
-    }
-    else
-      dp->map = mutt_get_translation (charset, Charset);
-  }
-  
-  return dp;
+  memmove (d->in.buff, d->in.buff + i, d->in.used - i);
+  d->in.used -= i;
 }
 
-void mutt_close_decoder (DECODER **dpp)
+static void _process_data_utf8 (DECODER *d)
 {
-  if (!dpp || !*dpp)
-    return;
+  size_t i, j;
+  CHARDESC *cd;
   
-  if ((*dpp)->is_utf8)
+  for (i = 0, j = 0; i < d->in.used && d->out.used < d->out.size;)
   {
-    _state_utf8_flush ((*dpp)->s, (*dpp)->chs, (*dpp)->sfu);
-    free_utf8_state (&(*dpp)->sfu);
+    while (((d->in.buff[j] & 0x80) == 0) && (j < d->in.used) && (d->out.used < d->out.size))
+      d->out.buff[d->out.used++] = d->in.buff[j++];
+    i = j;
+
+    while ((d->in.buff[j] & 0x80) && j < d->in.used &&
+	   (d->forced || j + 6 < d->in.used) && d->out.used < d->out.size)
+    {
+      int ch;
+      char *c = utf_to_unicode (&ch, &d->in.buff[j]);
+      
+      j = c - d->in.buff;
+
+      if (0 <= ch && ch < 128)
+	d->out.buff[d->out.used] = ch;
+      else if ((cd = repr2descr (ch, Unicode)) && (ch = translate_character (d->chs, cd->symbol)) != -1)
+	d->out.buff[d->out.used] = ch;
+      else
+	d->out.buff[d->out.used] = '?';
+      
+      if(!d->out.buff[d->out.used]) 
+	d->out.buff[d->out.used] = '?';
+      
+      d->out.used++;
+    }
+    
+    i = j;
+    
+    if (d->in.buff[j] & 0x80)
+      break;
   }
 
-  safe_free ((void **) dpp);
+  memmove (d->in.buff, d->in.buff + i, d->in.used - i);
+  d->in.used -= i;
 }
 
-void mutt_decoder_putc (DECODER *dp, char c)
+static void _process_data (DECODER *d, short force)
 {
-  if (dp->is_utf8)
-    state_fput_utf8 (dp->s, c, dp->chs, dp->sfu);
+  if (force) d->forced = 1;
+  
+  if (d->just_take_id)
+  {
+    size_t l = MIN (d->out.size - d->out.used, d->in.used);
+    memmove (d->out.buff + d->out.used, d->in.buff, l);
+    memmove (d->in.buff, d->in.buff + l, d->in.used - l);
+    d->in.used -= l;
+    d->out.used += l;
+  }
+  else if (d->src_is_utf8)
+    _process_data_utf8 (d);
   else
-    state_prefix_putc (mutt_display_char ((unsigned char) c, dp->map), dp->s);
+    _process_data_8bit (d);
 }
 
-/* FIXME: utf-8 support */
+/* This one is currently lacking utf-8 support */
 
 int mutt_recode_file (const char *fname, const char *src, const char *dest)
 {
   FILE *fp, *tmpfp;
   char tempfile[_POSIX_PATH_MAX];
+  char buffer[1024];
+  char tmp[1024];
   int c;
   int rv = -1;
-  
-  CHARSET_MAP *map;
 
-  if (mutt_is_utf8 (dest) ^ mutt_is_utf8(src))
+  size_t lf, lpu, lpo;
+  char *t;
+  DECODER *dec;
+
+  if (mutt_is_utf8 (dest) && !mutt_is_utf8 (src))
   {
     mutt_error (_("We can't currently handle utf-8 at this point."));
     return -1;
@@ -985,11 +1059,34 @@ int mutt_recode_file (const char *fname, const char *src, const char *dest)
     return -1;
   }
 
-  map = mutt_get_translation (src, dest);
+  dec = mutt_open_decoder (src, dest);
   
-  while ((c = fgetc (fp)) != EOF)
-    if (fputc (mutt_display_char ((unsigned char) c, map), tmpfp) == EOF)
-      goto bail;
+  while ((lf = fread (buffer, 1, sizeof (buffer), fp)) > 0)
+  {
+    for (t = buffer; lf; t += lpu)
+    {
+      mutt_decoder_push (dec, t, lf, &lpu);
+      lf -= lpu;
+      
+      do
+      {
+	mutt_decoder_pop (dec, tmp, sizeof (tmp), &lpo);
+	if (lpo)
+	  fwrite (tmp, lpo, 1, tmpfp);
+      } 
+      while (lpo);
+    }
+  }
+
+  mutt_decoder_push (dec, NULL, 0, NULL);
+  do 
+  {
+    mutt_decoder_pop (dec, tmp, sizeof (tmp), &lpo);
+    if (lpo) fwrite (tmp, lpo, 1, tmpfp);
+  }
+  while (lpo);
+
+  mutt_free_decoder (&dec);
 
   fclose (fp); fp = NULL;
   rewind (tmpfp);
diff --git a/charset.h b/charset.h
index 39522428..8f466ef5 100644
--- a/charset.h
+++ b/charset.h
@@ -53,31 +53,45 @@ typedef struct
 }
 CHARSET;
 
-/* this one could be made a bit smaller with two levels
- * of nested unions and structs.  It's not worth the effort.
- */
+#define DECODER_BUFFSIZE 4096
+
+struct decoder_buff
+{
+  size_t size, used;
+  char buff[DECODER_BUFFSIZE];
+};
 
 typedef struct decoder
 {
-  STATE *s;
-  short is_utf8;
-  CHARSET_MAP *map;
+  short src_is_utf8;
+  short just_take_id;
+  short forced;
+  
+  /* used for utf-8 decoding */
   CHARSET *chs;
-  struct utf8_state *sfu;
-}
-DECODER;    
 
+  /* used for 8-bit to 8-bit recoding */
+  CHARSET_MAP *chm;
+  
+  /* the buffers */
+  struct decoder_buff in;
+  struct decoder_buff out;
+} 
+DECODER;
+
+DECODER *mutt_open_decoder (const char *, const char *);
+void mutt_decoder_push (DECODER *, void *, size_t, size_t *);
+void mutt_decoder_pop (DECODER *, void *, size_t, size_t *);
+void mutt_decoder_pop_to_state (DECODER *, STATE *);
+void mutt_free_decoder (DECODER **);
 
 CHARSET *mutt_get_charset(const char *);
 CHARSET_MAP *mutt_get_translation(const char *, const char *);
-DECODER *mutt_open_decoder (STATE *, BODY *, int);
 int mutt_display_string(char *, CHARSET_MAP *);
 int mutt_is_utf8(const char *);
 int mutt_recode_file (const char *, const char *, const char *);
 unsigned char mutt_display_char(unsigned char, CHARSET_MAP *);
-void mutt_close_decoder (DECODER **);
 void mutt_decode_utf8_string(char *, CHARSET *);
-void mutt_decoder_putc (DECODER *, char);
 
 #endif
 
diff --git a/handler.c b/handler.c
index 0c5b4e43..66c15df9 100644
--- a/handler.c
+++ b/handler.c
@@ -64,17 +64,17 @@ int Index_64[128] = {
     41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
 };
 
-void mutt_decode_xbit (STATE *s, BODY *b, int istext)
+void mutt_decode_xbit (STATE *s, BODY *b, int istext, DECODER *dec)
 {
   long len = b->length;
   int c, ch;
-  
+  char cc;
+  size_t l;
+
   if (istext)
   {
-    DECODER *dec = mutt_open_decoder (s, b, istext);
-
     state_set_prefix(s);
-    
+
     while ((c = fgetc(s->fpin)) != EOF && len--)
     {
       if(c == '\r' && len)
@@ -87,10 +87,16 @@ void mutt_decode_xbit (STATE *s, BODY *b, int istext)
 	else 
 	  ungetc(ch, s->fpin);
       }
-	
-      mutt_decoder_putc (dec, c);
+
+      cc = c;
+      mutt_decoder_push (dec, &cc, 1, &l);
+      mutt_decoder_pop_to_state (dec, s);
+      
     }
-    mutt_close_decoder (&dec);
+    
+    mutt_decoder_push (dec, NULL, 0, NULL);
+    mutt_decoder_pop_to_state (dec, s);
+
     state_reset_prefix (s);
   }
   else
@@ -109,14 +115,15 @@ static int handler_state_fgetc(STATE *s)
   return ch;
 }
 
-void mutt_decode_quoted (STATE *s, BODY *b, int istext)
+void mutt_decode_quoted (STATE *s, BODY *b, int istext, DECODER *dec)
 {
   long len = b->length;
   int ch;
-  DECODER *dec = mutt_open_decoder (s, b, istext);
-  
+  char cc;
+  size_t l;
+
   state_set_prefix(s);
-  
+
   while (len > 0)
   {
     if ((ch = handler_state_fgetc(s)) == EOF)
@@ -171,20 +178,27 @@ void mutt_decode_quoted (STATE *s, BODY *b, int istext)
     }
 
     if(ch != EOF)
-      mutt_decoder_putc (dec, ch);
+    {
+      cc = ch;
+      mutt_decoder_push (dec, &cc, 1, &l);
+      mutt_decoder_pop_to_state (dec, s);
+    }
   }
 
-  mutt_close_decoder (&dec);
+  mutt_decoder_push (dec, NULL, 0, NULL);
+  mutt_decoder_pop_to_state (dec, s);
+  
   state_reset_prefix(s);
 }
 
-void mutt_decode_base64 (STATE *s, BODY *b, int istext)
+void mutt_decode_base64 (STATE *s, BODY *b, int istext, DECODER *dec)
 {
   long len = b->length;
   char buf[5];
   int c1, c2, c3, c4, ch, cr = 0, i;
-  DECODER *dec = mutt_open_decoder (s, b, istext);
-  
+  char cc;
+  size_t l;
+
   buf[4] = 0;
 
   if (istext) state_set_prefix(s);
@@ -206,13 +220,19 @@ void mutt_decode_base64 (STATE *s, BODY *b, int istext)
     ch = (c1 << 2) | (c2 >> 4);
 
     if (cr && ch != '\n') 
-      mutt_decoder_putc (dec, '\r');
+    {
+      cc = '\r';
+      mutt_decoder_push (dec, &cc, 1, &l);
+    }
     cr = 0;
       
     if (istext && ch == '\r')
       cr = 1;
     else
-      mutt_decoder_putc (dec, ch);
+    {
+      cc = ch;
+      mutt_decoder_push (dec, &cc, 1, &l);
+    }
 
     if (buf[2] == '=')
       break;
@@ -220,29 +240,45 @@ void mutt_decode_base64 (STATE *s, BODY *b, int istext)
     ch = ((c2 & 0xf) << 4) | (c3 >> 2);
 
     if (cr && ch != '\n')
-      mutt_decoder_putc (dec, ch);
+    {
+      cc = ch;
+      mutt_decoder_push (dec, &cc, 1, &l);
+    }
 
     cr = 0;
 
     if (istext && ch == '\r')
       cr = 1;
     else
-      mutt_decoder_putc (dec, ch);
+    {
+      cc = ch;
+      mutt_decoder_push (dec, &cc, 1, &l);
+    }
 
     if (buf[3] == '=') break;
     c4 = base64val (buf[3]);
     ch = ((c3 & 0x3) << 6) | c4;
 
     if (cr && ch != '\n')
-      mutt_decoder_putc (dec, ch);
+    {
+      cc = ch;
+      mutt_decoder_push (dec, &cc, 1, &l);
+    }
     cr = 0;
 
     if (istext && ch == '\r')
       cr = 1;
     else
-      mutt_decoder_putc (dec, ch);
+    {
+      cc = ch;
+      mutt_decoder_push (dec, &cc, 1, &l);
+    }
+    
+    mutt_decoder_pop_to_state (dec, s);
   }
-  mutt_close_decoder (&dec);
+  mutt_decoder_push (dec, NULL, 0, NULL);
+  mutt_decoder_pop_to_state (dec, s);
+
   state_reset_prefix(s);
 }
 
@@ -253,13 +289,13 @@ unsigned char decode_byte (char ch)
   return ch - 32;
 }
 
-void mutt_decode_uuencoded (STATE *s, BODY *b, int istext)
+void mutt_decode_uuencoded (STATE *s, BODY *b, int istext, DECODER *dec)
 {
   char tmps[SHORT_STRING];
   char linelen, c, l, out;
   char *pt;
   long len = b->length;
-  DECODER *dec = mutt_open_decoder (s, b, istext);
+  size_t dummy;
   
   if(istext)
     state_set_prefix(s);
@@ -289,16 +325,19 @@ void mutt_decode_uuencoded (STATE *s, BODY *b, int istext)
 	out = decode_byte (*pt) << l;
 	pt++;
 	out |= (decode_byte (*pt) >> (6 - l));
-	mutt_decoder_putc (dec, out);
+	mutt_decoder_push (dec, &out, 1, &dummy);
 	c++;
 	if (c == linelen)
 	  break;
       }
+      mutt_decoder_pop_to_state (dec, s);
       pt++;
     }
   }
 
-  mutt_close_decoder (&dec);
+  mutt_decoder_push (dec, NULL, 0, NULL);
+  mutt_decoder_pop_to_state (dec, s);
+  
   state_reset_prefix(s);
 }
 
@@ -1238,22 +1277,33 @@ static void external_body_handler (BODY *b, STATE *s)
 
 void mutt_decode_attachment (BODY *b, STATE *s)
 {
+  char *charset = mutt_get_parameter ("charset", b->parameter);
+  int istext = mutt_is_text_type (b->type, b->subtype);
+  DECODER *dec;
+
+  if (istext && s->flags & M_CHARCONV)
+    dec = mutt_open_decoder (charset, Charset);
+  else
+    dec = mutt_open_decoder (NULL, NULL);
+
   fseek (s->fpin, b->offset, 0);
   switch (b->encoding)
   {
     case ENCQUOTEDPRINTABLE:
-      mutt_decode_quoted (s, b, mutt_is_text_type (b->type, b->subtype));
+      mutt_decode_quoted (s, b, istext, dec);
       break;
     case ENCBASE64:
-      mutt_decode_base64 (s, b, mutt_is_text_type (b->type, b->subtype));
+      mutt_decode_base64 (s, b, istext, dec);
       break;
     case ENCUUENCODED:
-      mutt_decode_uuencoded (s, b, mutt_is_text_type (b->type, b->subtype));
+      mutt_decode_uuencoded (s, b, istext, dec);
       break;
     default:
-      mutt_decode_xbit (s, b, mutt_is_text_type (b->type, b->subtype));
+      mutt_decode_xbit (s, b, istext, dec);
       break;
   }
+
+  mutt_free_decoder (&dec);
 }
 
 void mutt_body_handler (BODY *b, STATE *s)
author	Thomas Roessler <roessler@does-not-exist.org>	1999-03-30 23:50:33 +0000
committer	Thomas Roessler <roessler@does-not-exist.org>	1999-03-30 23:50:33 +0000
commit	ec8c796bd17158c54f2415b1e71a82d309687126 (patch)
tree	472eb9f8f428af3c73f39c6e63ea1cebad36facd
parent	a784b853587106e064e2c72d2058b88223434a6a (diff)