diff options
Diffstat (limited to 'Mailman/Handlers/Scrubber.py')
-rw-r--r-- | Mailman/Handlers/Scrubber.py | 101 |
1 files changed, 72 insertions, 29 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py index 024832a4..b5be73df 100644 --- a/Mailman/Handlers/Scrubber.py +++ b/Mailman/Handlers/Scrubber.py @@ -17,6 +17,8 @@ """Cleanse a message for archiving. """ +from __future__ import nested_scopes + import os import re import sha @@ -24,7 +26,6 @@ import time import errno import binascii import tempfile -import mimetypes from cStringIO import StringIO from types import IntType @@ -51,6 +52,35 @@ dre = re.compile(r'^\.*') BR = '<br>\n' SPACE = ' ' +try: + from mimetypes import guess_all_extensions +except ImportError: + import mimetypes + def guess_all_extensions(ctype, strict=1): + # BAW: sigh, guess_all_extensions() is new in Python 2.3 + all = [] + def check(map): + for e, t in map.items(): + if t == ctype: + all.append(e) + check(mimetypes.types_map) + # Python 2.1 doesn't have common_types. Sigh, sigh. + if not strict and hasattr(mimetypes, 'common_types'): + check(mimetypes.common_types) + return all + + + +def guess_extension(ctype, ext): + # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot, + # and .wiz are all mapped to application/msword. This sucks for finding + # the best reverse mapping. If the extension is one of the given + # mappings, we'll trust that, otherwise we'll just guess. :/ + all = guess_all_extensions(ctype, strict=0) + if ext in all: + return ext + return all and all[0] + # We're using a subclass of the standard Generator because we want to suppress @@ -131,6 +161,7 @@ def process(mlist, msg, msgdata=None): msgdata = {} dir = calculate_attachments_dir(mlist, msg, msgdata) charset = None + lcset = Utils.GetCharSet(mlist.preferred_language) # Now walk over all subparts of this message and scrub out various types for part in msg.walk(): ctype = part.get_type(part.get_default_type()) @@ -140,13 +171,16 @@ def process(mlist, msg, msgdata=None): # arbitrarily pick the charset of the first text/plain part in the # message. 
if charset is None: - charset = part.get_content_charset(charset) + charset = part.get_content_charset(lcset) elif ctype == 'text/html' and isinstance(sanitize, IntType): if sanitize == 0: if outer: raise DiscardMessage - part.set_payload(_('HTML attachment scrubbed and removed')) - part.set_type('text/plain') + del part['content-type'] + part.set_payload(_('HTML attachment scrubbed and removed'), + # Adding charset arg and removing content-type + # sets content-type to text/plain + lcset) elif sanitize == 2: # By leaving it alone, Pipermail will automatically escape it pass @@ -159,11 +193,11 @@ def process(mlist, msg, msgdata=None): url = save_attachment(mlist, part, dir, filter_html=0) finally: os.umask(omask) + del part['content-type'] part.set_payload(_("""\ An HTML attachment was scrubbed... URL: %(url)s -""")) - part.set_type('text/plain') +"""), lcset) else: # HTML-escape it and store it as an attachment, but make it # look a /little/ bit prettier. :( @@ -185,11 +219,11 @@ URL: %(url)s url = save_attachment(mlist, part, dir, filter_html=0) finally: os.umask(omask) + del part['content-type'] part.set_payload(_("""\ An HTML attachment was scrubbed... URL: %(url)s -""")) - part.set_type('text/plain') +"""), lcset) elif ctype == 'message/rfc822': # This part contains a submessage, so it too needs scrubbing submsg = part.get_payload(0) @@ -202,6 +236,7 @@ URL: %(url)s date = submsg.get('date', _('no date')) who = submsg.get('from', _('unknown sender')) size = len(str(submsg)) + del part['content-type'] part.set_payload(_("""\ An embedded message was scrubbed... From: %(who)s @@ -209,13 +244,12 @@ Subject: %(subject)s Date: %(date)s Size: %(size)s Url: %(url)s -""")) - part.set_type('text/plain') +"""), lcset) # If the message isn't a multipart, then we'll strip it out as an # attachment that would have to be separately downloaded. Pipermail # will transform the url into a hyperlink. 
elif not part.is_multipart(): - payload = part.get_payload() + payload = part.get_payload(decode=1) ctype = part.get_type() size = len(payload) omask = os.umask(002) @@ -225,6 +259,8 @@ Url: %(url)s os.umask(omask) desc = part.get('content-description', _('not available')) filename = part.get_filename(_('not available')) + del part['content-type'] + del part['content-transfer-encoding'] part.set_payload(_("""\ A non-text attachment was scrubbed... Name: %(filename)s @@ -232,8 +268,7 @@ Type: %(ctype)s Size: %(size)d bytes Desc: %(desc)s Url : %(url)s -""")) - part.set_type('text/plain') +"""), lcset) outer = 0 # We still have to sanitize multipart messages to flat text because # Pipermail can't handle messages with list payloads. This is a kludge; @@ -242,8 +277,8 @@ Url : %(url)s # By default we take the charset of the first text/plain part in the # message, but if there was none, we'll use the list's preferred # language's charset. - if charset is None: - charset = Utils.GetCharSet(mlist.preferred_language) + if charset is None or charset == 'us-ascii': + charset = lcset # We now want to concatenate all the parts which have been scrubbed to # text/plain, into a single text/plain payload. 
We need to make sure # all the characters in the concatenated string are in the same @@ -261,20 +296,26 @@ Url : %(url)s t = part.get_payload(decode=1) except binascii.Error: t = part.get_payload() - partcharset = part.get_charset() + partcharset = part.get_content_charset() if partcharset and partcharset <> charset: try: t = unicode(t, partcharset, 'replace') - # Should use HTML-Escape, or try generalizing to UTF-8 - t = t.encode(charset, 'replace') - except UnicodeError: + except (UnicodeError, LookupError): # Replace funny characters t = unicode(t, 'ascii', 'replace').encode('ascii') + try: + # Should use HTML-Escape, or try generalizing to UTF-8 + t = t.encode(charset, 'replace') + except (UnicodeError, LookupError): + t = t.encode(lcset, 'replace') + # Separation is useful + if not t.endswith('\n'): + t += '\n' text.append(t) # Now join the text and set the payload sep = _('-------------- next part --------------\n') + del msg['content-type'] msg.set_payload(sep.join(text), charset) - msg.set_type('text/plain') del msg['content-transfer-encoding'] msg.add_header('Content-Transfer-Encoding', '8bit') return msg @@ -285,13 +326,13 @@ def makedirs(dir): # Create all the directories to store this attachment in try: os.makedirs(dir, 02775) + # Unfortunately, FreeBSD seems to be broken in that it doesn't honor + # the mode arg of mkdir(). + def twiddle(arg, dirname, names): + os.chmod(dirname, 02775) + os.path.walk(dir, twiddle, None) except OSError, e: if e.errno <> errno.EEXIST: raise - # Unfortunately, FreeBSD seems to be broken in that it doesn't honor the - # mode arg of mkdir(). - def twiddle(arg, dirname, names): - os.chmod(dirname, 02775) - os.path.walk(dir, twiddle, None) @@ -303,13 +344,15 @@ def save_attachment(mlist, msg, dir, filter_html=1): # BAW: mimetypes ought to handle non-standard, but commonly found types, # e.g. image/jpg (should be image/jpeg). For now we just store such # things as application/octet-streams since that seems the safest. 
- ext = mimetypes.guess_extension(msg.get_type()) + ctype = msg.get_content_type() + fnext = os.path.splitext(msg.get_filename(''))[1] + ext = guess_extension(ctype, fnext) if not ext: # We don't know what it is, so assume it's just a shapeless # application/octet-stream, unless the Content-Type: is # message/rfc822, in which case we know we'll coerce the type to # text/plain below. - if msg.get_type() == 'message/rfc822': + if ctype == 'message/rfc822': ext = '.txt' else: ext = '.bin' @@ -361,7 +404,7 @@ def save_attachment(mlist, msg, dir, filter_html=1): # ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be # here), then send the attachment through the filter program for # sanitization - if filter_html and msg.get_type() == 'text/html': + if filter_html and ctype == 'text/html': base, ext = os.path.splitext(path) tmppath = base + '-tmp' + ext fp = open(tmppath, 'w') @@ -384,7 +427,7 @@ def save_attachment(mlist, msg, dir, filter_html=1): ext = '.txt' path = base + '.txt' # Is it a message/rfc822 attachment? - elif msg.get_type() == 'message/rfc822': + elif ctype == 'message/rfc822': submsg = msg.get_payload() # BAW: I'm sure we can eventually do better than this. :( decodedpayload = Utils.websafe(str(submsg)) |