aboutsummaryrefslogtreecommitdiffstats
path: root/Mailman/Handlers/Scrubber.py
diff options
context:
space:
mode:
Diffstat (limited to 'Mailman/Handlers/Scrubber.py')
-rw-r--r--Mailman/Handlers/Scrubber.py101
1 files changed, 72 insertions, 29 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index 024832a4..b5be73df 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -17,6 +17,8 @@
"""Cleanse a message for archiving.
"""
+from __future__ import nested_scopes
+
import os
import re
import sha
@@ -24,7 +26,6 @@ import time
import errno
import binascii
import tempfile
-import mimetypes
from cStringIO import StringIO
from types import IntType
@@ -51,6 +52,35 @@ dre = re.compile(r'^\.*')
BR = '<br>\n'
SPACE = ' '
+try:
+ from mimetypes import guess_all_extensions
+except ImportError:
+ import mimetypes
+ def guess_all_extensions(ctype, strict=1):
+ # BAW: sigh, guess_all_extensions() is new in Python 2.3
+ all = []
+ def check(map):
+ for e, t in map.items():
+ if t == ctype:
+ all.append(e)
+ check(mimetypes.types_map)
+ # Python 2.1 doesn't have common_types. Sigh, sigh.
+ if not strict and hasattr(mimetypes, 'common_types'):
+ check(mimetypes.common_types)
+ return all
+
+
+
+def guess_extension(ctype, ext):
+ # mimetypes maps multiple extensions to the same type, e.g. .doc, .dot,
+ # and .wiz are all mapped to application/msword. This sucks for finding
+ # the best reverse mapping. If the extension is one of the giving
+ # mappings, we'll trust that, otherwise we'll just guess. :/
+ all = guess_all_extensions(ctype, strict=0)
+ if ext in all:
+ return ext
+ return all and all[0]
+
# We're using a subclass of the standard Generator because we want to suppress
@@ -131,6 +161,7 @@ def process(mlist, msg, msgdata=None):
msgdata = {}
dir = calculate_attachments_dir(mlist, msg, msgdata)
charset = None
+ lcset = Utils.GetCharSet(mlist.preferred_language)
# Now walk over all subparts of this message and scrub out various types
for part in msg.walk():
ctype = part.get_type(part.get_default_type())
@@ -140,13 +171,16 @@ def process(mlist, msg, msgdata=None):
# arbitrarily pick the charset of the first text/plain part in the
# message.
if charset is None:
- charset = part.get_content_charset(charset)
+ charset = part.get_content_charset(lcset)
elif ctype == 'text/html' and isinstance(sanitize, IntType):
if sanitize == 0:
if outer:
raise DiscardMessage
- part.set_payload(_('HTML attachment scrubbed and removed'))
- part.set_type('text/plain')
+ del part['content-type']
+ part.set_payload(_('HTML attachment scrubbed and removed'),
+ # Adding charset arg and removing content-tpe
+ # sets content-type to text/plain
+ lcset)
elif sanitize == 2:
# By leaving it alone, Pipermail will automatically escape it
pass
@@ -159,11 +193,11 @@ def process(mlist, msg, msgdata=None):
url = save_attachment(mlist, part, dir, filter_html=0)
finally:
os.umask(omask)
+ del part['content-type']
part.set_payload(_("""\
An HTML attachment was scrubbed...
URL: %(url)s
-"""))
- part.set_type('text/plain')
+"""), lcset)
else:
# HTML-escape it and store it as an attachment, but make it
# look a /little/ bit prettier. :(
@@ -185,11 +219,11 @@ URL: %(url)s
url = save_attachment(mlist, part, dir, filter_html=0)
finally:
os.umask(omask)
+ del part['content-type']
part.set_payload(_("""\
An HTML attachment was scrubbed...
URL: %(url)s
-"""))
- part.set_type('text/plain')
+"""), lcset)
elif ctype == 'message/rfc822':
# This part contains a submessage, so it too needs scrubbing
submsg = part.get_payload(0)
@@ -202,6 +236,7 @@ URL: %(url)s
date = submsg.get('date', _('no date'))
who = submsg.get('from', _('unknown sender'))
size = len(str(submsg))
+ del part['content-type']
part.set_payload(_("""\
An embedded message was scrubbed...
From: %(who)s
@@ -209,13 +244,12 @@ Subject: %(subject)s
Date: %(date)s
Size: %(size)s
Url: %(url)s
-"""))
- part.set_type('text/plain')
+"""), lcset)
# If the message isn't a multipart, then we'll strip it out as an
# attachment that would have to be separately downloaded. Pipermail
# will transform the url into a hyperlink.
elif not part.is_multipart():
- payload = part.get_payload()
+ payload = part.get_payload(decode=1)
ctype = part.get_type()
size = len(payload)
omask = os.umask(002)
@@ -225,6 +259,8 @@ Url: %(url)s
os.umask(omask)
desc = part.get('content-description', _('not available'))
filename = part.get_filename(_('not available'))
+ del part['content-type']
+ del part['content-transfer-encoding']
part.set_payload(_("""\
A non-text attachment was scrubbed...
Name: %(filename)s
@@ -232,8 +268,7 @@ Type: %(ctype)s
Size: %(size)d bytes
Desc: %(desc)s
Url : %(url)s
-"""))
- part.set_type('text/plain')
+"""), lcset)
outer = 0
# We still have to sanitize multipart messages to flat text because
# Pipermail can't handle messages with list payloads. This is a kludge;
@@ -242,8 +277,8 @@ Url : %(url)s
# By default we take the charset of the first text/plain part in the
# message, but if there was none, we'll use the list's preferred
# language's charset.
- if charset is None:
- charset = Utils.GetCharSet(mlist.preferred_language)
+ if charset is None or charset == 'us-ascii':
+ charset = lcset
# We now want to concatenate all the parts which have been scrubbed to
# text/plain, into a single text/plain payload. We need to make sure
# all the characters in the concatenated string are in the same
@@ -261,20 +296,26 @@ Url : %(url)s
t = part.get_payload(decode=1)
except binascii.Error:
t = part.get_payload()
- partcharset = part.get_charset()
+ partcharset = part.get_content_charset()
if partcharset and partcharset <> charset:
try:
t = unicode(t, partcharset, 'replace')
- # Should use HTML-Escape, or try generalizing to UTF-8
- t = t.encode(charset, 'replace')
- except UnicodeError:
+ except (UnicodeError, LookupError):
# Replace funny characters
t = unicode(t, 'ascii', 'replace').encode('ascii')
+ try:
+ # Should use HTML-Escape, or try generalizing to UTF-8
+ t = t.encode(charset, 'replace')
+ except (UnicodeError, LookupError):
+ t = t.encode(lcset, 'replace')
+ # Separation is useful
+ if not t.endswith('\n'):
+ t += '\n'
text.append(t)
# Now join the text and set the payload
sep = _('-------------- next part --------------\n')
+ del msg['content-type']
msg.set_payload(sep.join(text), charset)
- msg.set_type('text/plain')
del msg['content-transfer-encoding']
msg.add_header('Content-Transfer-Encoding', '8bit')
return msg
@@ -285,13 +326,13 @@ def makedirs(dir):
# Create all the directories to store this attachment in
try:
os.makedirs(dir, 02775)
+ # Unfortunately, FreeBSD seems to be broken in that it doesn't honor
+ # the mode arg of mkdir().
+ def twiddle(arg, dirname, names):
+ os.chmod(dirname, 02775)
+ os.path.walk(dir, twiddle, None)
except OSError, e:
if e.errno <> errno.EEXIST: raise
- # Unfortunately, FreeBSD seems to be broken in that it doesn't honor the
- # mode arg of mkdir().
- def twiddle(arg, dirname, names):
- os.chmod(dirname, 02775)
- os.path.walk(dir, twiddle, None)
@@ -303,13 +344,15 @@ def save_attachment(mlist, msg, dir, filter_html=1):
# BAW: mimetypes ought to handle non-standard, but commonly found types,
# e.g. image/jpg (should be image/jpeg). For now we just store such
# things as application/octet-streams since that seems the safest.
- ext = mimetypes.guess_extension(msg.get_type())
+ ctype = msg.get_content_type()
+ fnext = os.path.splitext(msg.get_filename(''))[1]
+ ext = guess_extension(ctype, fnext)
if not ext:
# We don't know what it is, so assume it's just a shapeless
# application/octet-stream, unless the Content-Type: is
# message/rfc822, in which case we know we'll coerce the type to
# text/plain below.
- if msg.get_type() == 'message/rfc822':
+ if ctype == 'message/rfc822':
ext = '.txt'
else:
ext = '.bin'
@@ -361,7 +404,7 @@ def save_attachment(mlist, msg, dir, filter_html=1):
# ARCHIVE_HTML_SANITIZER is a string (which it must be or we wouldn't be
# here), then send the attachment through the filter program for
# sanitization
- if filter_html and msg.get_type() == 'text/html':
+ if filter_html and ctype == 'text/html':
base, ext = os.path.splitext(path)
tmppath = base + '-tmp' + ext
fp = open(tmppath, 'w')
@@ -384,7 +427,7 @@ def save_attachment(mlist, msg, dir, filter_html=1):
ext = '.txt'
path = base + '.txt'
# Is it a message/rfc822 attachment?
- elif msg.get_type() == 'message/rfc822':
+ elif ctype == 'message/rfc822':
submsg = msg.get_payload()
# BAW: I'm sure we can eventually do better than this. :(
decodedpayload = Utils.websafe(str(submsg))