From 035a0dfe1f1b2afd5d4fcb122a8d7026f506acd4 Mon Sep 17 00:00:00 2001 From: tkikuchi <> Date: Fri, 17 Sep 2004 03:00:43 +0000 Subject: [ 891491 ] Scrubber.py patch by tkikuchi Modified Files: Mailman/Handlers/Scrubber.py Mailman/Defaults.py.in Fixes some bugs: charset problem where input/output charset are different text file without charset makes trouble when merging into one text msg. delete content-transfer encoding use Utils.oneline() for i18n filename use msg.walk() to collect parts Introduce: mm_cfg.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME to prevent extremely long fn. --- Mailman/Defaults.py.in | 5 +++++ Mailman/Handlers/Scrubber.py | 53 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/Mailman/Defaults.py.in b/Mailman/Defaults.py.in index c9ee77b3..c8f48ff6 100644 --- a/Mailman/Defaults.py.in +++ b/Mailman/Defaults.py.in @@ -256,6 +256,11 @@ PRIVATE_EXTERNAL_ARCHIVER = No # should modify the Message object as necessary. ARCHIVE_SCRUBBER = 'Mailman.Handlers.Scrubber' +# Mailman.Handlers.Scrubber uses attachment's filename as is. +# If you don't like this (extremely long mime-encoded filename) then set +# this True. +SCRUBBER_DONT_USE_ATTACHMENT_FILENAME = False + # This variable defines what happens to text/html subparts. They can be # stripped completely, escaped, or filtered through an external program. The # legal values are: diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py index eccdc6e2..8c1124ec 100644 --- a/Mailman/Handlers/Scrubber.py +++ b/Mailman/Handlers/Scrubber.py @@ -27,11 +27,12 @@ import errno import binascii import tempfile from cStringIO import StringIO -from types import IntType +from types import IntType, StringType from email.Utils import parsedate from email.Parser import HeaderParser from email.Generator import Generator +from email.Charset import Charset from Mailman import mm_cfg from Mailman import Utils @@ -170,6 +171,7 @@ def process(mlist, msg, msgdata=None): dir = calculate_attachments_dir(mlist, msg, msgdata) charset = None lcset = Utils.GetCharSet(mlist.preferred_language) + lcset_out = Charset(lcset).output_charset or lcset # Now walk over all subparts of this message and scrub out various types for part in msg.walk(): ctype = part.get_type(part.get_default_type()) @@ -180,11 +182,29 @@ def process(mlist, msg, msgdata=None): # message. if charset is None: charset = part.get_content_charset(lcset) + # TK: if part is attached then check charset and scrub if none + if part.get('content-disposition') and \ + not part.get_content_charset(): + omask = os.umask(002) + try: + url = save_attachment(mlist, part, dir) + finally: + os.umask(omask) + filename = part.get_filename(_('not available')) + filename = Utils.oneline(filename, lcset) + del part['content-type'] + del part['content-transfer-encoding'] + part.set_payload(_("""\ +An embedded and charset-unspecified text was scrubbed... +Name: %(filename)s +Url: %(url)s +"""), lcset) elif ctype == 'text/html' and isinstance(sanitize, IntType): if sanitize == 0: if outer: raise DiscardMessage del part['content-type'] + del part['content-transfer-encoding'] part.set_payload(_('HTML attachment scrubbed and removed'), # Adding charset arg and removing content-tpe # sets content-type to text/plain @@ -202,6 +222,7 @@ def process(mlist, msg, msgdata=None): finally: os.umask(omask) del part['content-type'] + del part['content-transfer-encoding'] part.set_payload(_("""\ An HTML attachment was scrubbed... URL: %(url)s @@ -267,6 +288,7 @@ Url: %(url)s os.umask(omask) desc = part.get('content-description', _('not available')) filename = part.get_filename(_('not available')) + filename = Utils.oneline(filename, lcset) del part['content-type'] del part['content-transfer-encoding'] part.set_payload(_("""\ @@ -285,8 +307,8 @@ Url : %(url)s # By default we take the charset of the first text/plain part in the # message, but if there was none, we'll use the list's preferred # language's charset. - if charset is None or charset == 'us-ascii': - charset = lcset + if not charset or charset == 'us-ascii': + charset = lcset_out # We now want to concatenate all the parts which have been scrubbed to # text/plain, into a single text/plain payload. We need to make sure # all the characters in the concatenated string are in the same @@ -294,17 +316,27 @@ Url : %(url)s # BAW: Martin's original patch suggested we might want to try # generalizing to utf-8, and that's probably a good idea (eventually). text = [] - for part in msg.get_payload(): + for part in msg.walk(): + if part.get_content_maintype() == 'multipart': + continue # All parts should be scrubbed to text/plain by now. partctype = part.get_content_type() if partctype <> 'text/plain': - text.append(_('Skipped content of type %(partctype)s')) + text.append(_('Skipped content of type %(partctype)s\n')) continue try: t = part.get_payload(decode=True) except binascii.Error: t = part.get_payload() - partcharset = part.get_content_charset() + # TK: get_content_charset() returns 'iso-2022-jp' for internally + # crafted (scrubbed) 'euc-jp' text part. So, first try + # get_charset(), then get_content_charset() for the parts + # which are already embeded in the incoming message. + partcharset = part.get_charset() + if partcharset: + partcharset = str(partcharset) + else: + partcharset = part.get_content_charset() if partcharset and partcharset <> charset: try: t = unicode(t, partcharset, 'replace') @@ -320,9 +352,10 @@ Url : %(url)s except (UnicodeError, LookupError, ValueError): t = t.encode(lcset, 'replace') # Separation is useful - if not t.endswith('\n'): - t += '\n' - text.append(t) + if isinstance(t, StringType): + if not t.endswith('\n'): + t += '\n' + text.append(t) # Now join the text and set the payload sep = _('-------------- next part --------------\n') del msg['content-type'] @@ -376,7 +409,7 @@ def save_attachment(mlist, msg, dir, filter_html=True): # Now base the filename on what's in the attachment, uniquifying it if # necessary. filename = msg.get_filename() - if not filename: + if not filename or mm_cfg.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME: filebase = 'attachment' else: # Sanitize the filename given in the message headers -- cgit v1.2.3