1 files changed, 72 insertions, 29 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index 024832a4..b5be73df 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -17,6 +17,8 @@
 """Cleanse a message for archiving.
 """
 
+from __future__ import nested_scopes
+
 import os
 import re
 import sha
@@ -24,7 +26,6 @@ import time
 import errno
 import binascii
 import tempfile
-import mimetypes
 from cStringIO import StringIO
 from types import IntType
 
@@ -51,6 +52,35 @@ dre = re.compile(r'^\.*')
 BR = '<br>\n'
 SPACE = ' '
 
+try:
+    from mimetypes import guess_all_extensions
+except ImportError:
+    import mimetypes
+    def guess_all_extensions(ctype, strict=1):
+        # BAW: sigh, guess_all_extensions() is new in Python 2.3
+        all = []
+        def check(map):
+            for e, t in map.items():
+                if t == ctype:
+                    all.append(e)
+        check(mimetypes.types_map)
+        # Python 2.1 doesn't have common_types.  Sigh, sigh.
+        if not strict and hasattr(mimetypes, 'common_types'):
+            check(mimetypes.common_types)
+        return all
+
+
+
+def guess_extension(ctype, ext):
+    # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
+    # and .wiz are all mapped to application/msword.  This sucks for finding
+    # the best reverse mapping.  If the extension is one of the given
+    # mappings, we'll trust that, otherwise we'll just guess. :/
+    all = guess_all_extensions(ctype, strict=0)
+    if ext in all:
+        return ext
+    return all and all[0]
+
 
 
 # We're using a subclass of the standard Generator because we want to suppress
@@ -131,6 +161,7 @@ def process(mlist, msg, msgdata=None):
         msgdata = {}
     dir = calculate_attachments_dir(mlist, msg, msgdata)
     charset = None
+    lcset = Utils.GetCharSet(mlist.preferred_language)
     # Now walk over all subparts of this message and scrub out various types
     for part in msg.walk():
         ctype = part.get_type(part.get_default_type())
@@ -140,13 +171,16 @@ def process(mlist, msg, msgdata=None):
             # arbitrarily pick the charset of the first text/plain part in the
             # message.
             if charset is None:
-                charset = part.get_content_charset(charset)
+                charset = part.get_content_charset(lcset)
         elif ctype == 'text/html' and isinstance(sanitize, IntType):
             if sanitize == 0:
                 if outer:
                     raise DiscardMessage
-                part.set_payload(_('HTML attachment scrubbed and removed'))
-                part.set_type('text/plain')
+                del part['content-type']
+                part.set_payload(_('HTML attachment scrubbed and removed'),
+                                 # Adding charset arg and removing content-type
+                                 # sets content-type to text/plain
+                                 lcset)
             elif sanitize == 2:
                 # By leaving it alone, Pipermail will automatically escape it
                 pass
@@ -159,11 +193,11 @@ def process(mlist, msg, msgdata=None):
                     url = save_attachment(mlist, part, dir, filter_html=0)
                 finally:
                     os.umask(omask)
+                del part['content-type']
                 part.set_payload(_("""\
 An HTML attachment was scrubbed...
 URL: %(url)s
-"""))
-                part.set_type('text/plain')
+"""), lcset)
             else:
                 # HTML-escape it and store it as an attachment, but make it
                 # look a /little/ bit prettier. :(
@@ -185,11 +219,11 @@ URL: %(url)s
                     url = save_attachment(mlist, part, dir, filter_html=0)
                 finally:
                     os.umask(omask)
+                del part['content-type']
                 part.set_payload(_("""\
 An HTML attachment was scrubbed...
 URL: %(url)s
-"""))
-                part.set_type('text/plain')
+"""), lcset)
         elif ctype == 'message/rfc822':
             # This part contains a submessage, so it too needs scrubbing
             submsg = part.get_payload(0)
@@ -202,6 +236,7 @@ URL: %(url)s
             date = submsg.get('date', _('no date'))
             who = submsg.get('from', _('unknown sender'))
             size = len(str(submsg))
+            del part['content-type']
             part.set_payload(_("""\
 An embedded message was scrubbed...
 From: %(who)s
@@ -209,13 +244,12 @@ Subject: %(subject)s
 Date: %(date)s
 Size: %(size)s
 Url: %(url)s
-"""))
-            part.set_type('text/plain')
+"""), lcset)
         # If the message isn't a multipart, then we'll strip it out as an
         # attachment that would have to be separately downloaded.  Pipermail
         # will transform the url into a hyperlink.
         elif not part.is_multipart():
-            payload = part.get_payload()
+            payload = part.get_payload(decode=1)
             ctype = part.get_type()
             size = len(payload)
             omask = os.umask(002)
@@ -225,6 +259,8 @@ Url: %(url)s
                 os.umask(omask)
             desc = part.get('content-description', _('not available'))
             filename = part.get_filename(_('not available'))
+            del part['content-type']
+            del part['content-transfer-encoding']
             part.set_payload(_("""\
 A non-text attachment was scrubbed...
 Name: %(filename)s
@@ -232,8 +268,7 @@ Type: %(ctype)s
 Size: %(size)d bytes
 Desc: %(desc)s
 Url : %(url)s
-"""))
-            part.set_type('text/plain')
+"""), lcset)
         outer = 0
     # We still have to sanitize multipart messages to flat text because
     # Pipermail can't handle messages with list payloads.  This is a kludge;
@@ -242,8 +277,8 @@ Url : %(url)s
         # By default we take the charset of the first text/plain part in the
         # message, but if there was none, we'll use the list's preferred
         # language's charset.
-        if charset is None:
-            charset = Utils.GetCharSet(mlist.preferred_language)
+        if charset is None or charset == 'us-ascii':
+            charset = lcset
         # We now want to concatenate all the parts which have been scrubbed to
         # text/plain, into a single text/plain payload.  We need to make sure
         # all the characters in the concatenated string are in the same
@@ -261,20 +296,26 @@ Url : %(url)s
                 t = part.get_payload(decode=1)
             except binascii.Error:
                 t = part.get_payload()
-            partcharset = part.get_charset()
+            partcharset = part.get_content_charset()
             if partcharset and partcharset <> charset:
                 try:
                     t = unicode(t, partcharset, 'replace')
-                    # Should use HTML-Escape, or try generalizing to UTF-8
-                    t = t.encode(charset, 'replace')
-                except UnicodeError:
+                except (UnicodeError, LookupError):
                     # Replace funny characters
                     t = unicode(t, 'ascii', 'replace').encode('ascii')
+                try:
+                    # Should use HTML-Escape, or try generalizing to UTF-8
+                    t = t.encode(charset, 'replace')
+                except (UnicodeError, LookupError):
+                    t = t.encode(lcset, 'replace')
+            # Separation is useful
+            if not t.endswith('\n'):
+                t += '\n'
             text.append(t)
         # Now join the text and set the payload
         sep = _('-------------- next part --------------\n')
+        del msg['content-type']
         msg.set_payload(sep.join(text), charset)
-        msg.set_type('text/plain')
         del msg['content-transfer-encoding']
         msg.add_header('Content-Transfer-Encoding', '8bit')
     return msg
@@ -285,13 +326,13 @@ def makedirs(dir):
     # Create all the directories to store this attachment in
     try:
         os.makedirs(dir, 02775)
+        # Unfortunately, FreeBSD seems to be broken in that it doesn't honor
+        # the mode arg of mkdir().
+        def twiddle(arg, dirname, names):
+            os.chmod(dirname, 02775)
+        os.path.walk(dir, twiddle, None)
     except OSError, e:
         if e.errno <> errno.EEXIST: raise
-    # Unfortunately, FreeBSD seems to be broken in that it doesn't honor the
-    # mode arg of mkdir().
-    def twiddle(arg, dirname, names):
-        os.chmod(dirname, 02775)
-    os.path.walk(dir, twiddle, None)
 
 
 
@@ -303,13 +344,15 @@ def save_attachment(mlist, msg, dir, filter_html=1):
     # BAW: mimetypes ought to handle non-standard, but commonly found types,
     # e.g. image/jpg (should be image/jpeg).  For now we just store such
     # things as application/octet-streams since that seems the safest.
-    ext = mimetypes.guess_extension(msg.get_type())
+    ctype = msg.get_content_type()
+    fnext = os.path.splitext(msg.get_filename(''))[1]
+    ext = guess_extension(ctype, fnext)
     if not ext:
         # We don't know what it is, so assume it's just a shapeless
         # application/octet-stream, unless the Content-Type: is
         # message/rfc822, in which case we know we'll coerce the type to
         # text/plain below.
-        if msg.get_type() == 'message/rfc822':
+        if ctype == 'message/rfc822':
             ext = '.txt'
         else:
             ext = '.bin'
@@ -361,7 +404,7 @@ def save_attachment(mlist, msg, dir, filter_html=1):
     # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
     # here), then send the attachment through the filter program for
     # sanitization
-    if filter_html and msg.get_type() == 'text/html':
+    if filter_html and ctype == 'text/html':
         base, ext = os.path.splitext(path)
         tmppath = base + '-tmp' + ext
         fp = open(tmppath, 'w')
@@ -384,7 +427,7 @@ def save_attachment(mlist, msg, dir, filter_html=1):
         ext = '.txt'
         path = base + '.txt'
     # Is it a message/rfc822 attachment?
-    elif msg.get_type() == 'message/rfc822':
+    elif ctype == 'message/rfc822':
         submsg = msg.get_payload()
         # BAW: I'm sure we can eventually do better than this. :(
         decodedpayload = Utils.websafe(str(submsg))