[ 891491 ] Scrubber.py patch by tkikuchi

Modified Files: Mailman/Handlers/Scrubber.py Mailman/Defaults.py.in Fixes some bugs: charset problem where input/output charset are different text file without charset makes trouble when merging into one text msg. delete content-transfer encoding use Utils.oneline() for i18n filename use msg.walk() to collect parts Introduce: mm_cfg.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME to prevent extremely long fn.
author: tkikuchi <> 2004-09-17 03:00:43 +0000
committer: tkikuchi <> 2004-09-17 03:00:43 +0000
commit: 035a0dfe1f1b2afd5d4fcb122a8d7026f506acd4 (patch)
tree: 3181ee126ea817f29de97239ad700302a67d7d77
parent: c8dd8a96e4e5fdd6a1456c171ecf5cfb09d54a73 (diff)
download: mailman2-035a0dfe1f1b2afd5d4fcb122a8d7026f506acd4.tar.gz
mailman2-035a0dfe1f1b2afd5d4fcb122a8d7026f506acd4.tar.xz
mailman2-035a0dfe1f1b2afd5d4fcb122a8d7026f506acd4.zip
2 files changed, 48 insertions, 10 deletions
diff --git a/Mailman/Defaults.py.in b/Mailman/Defaults.py.in
index c9ee77b3..c8f48ff6 100644
--- a/Mailman/Defaults.py.in
+++ b/Mailman/Defaults.py.in
@@ -256,6 +256,11 @@ PRIVATE_EXTERNAL_ARCHIVER = No
 # should modify the Message object as necessary.
 ARCHIVE_SCRUBBER = 'Mailman.Handlers.Scrubber'
 
+# Mailman.Handlers.Scrubber uses attachment's filename as is.
+# If you don't like this (extremely long mime-encoded filename) then set 
+# this True.
+SCRUBBER_DONT_USE_ATTACHMENT_FILENAME = False
+
 # This variable defines what happens to text/html subparts.  They can be
 # stripped completely, escaped, or filtered through an external program.  The
 # legal values are:
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
index eccdc6e2..8c1124ec 100644
--- a/Mailman/Handlers/Scrubber.py
+++ b/Mailman/Handlers/Scrubber.py
@@ -27,11 +27,12 @@ import errno
 import binascii
 import tempfile
 from cStringIO import StringIO
-from types import IntType
+from types import IntType, StringType
 
 from email.Utils import parsedate
 from email.Parser import HeaderParser
 from email.Generator import Generator
+from email.Charset import Charset
 
 from Mailman import mm_cfg
 from Mailman import Utils
@@ -170,6 +171,7 @@ def process(mlist, msg, msgdata=None):
     dir = calculate_attachments_dir(mlist, msg, msgdata)
     charset = None
     lcset = Utils.GetCharSet(mlist.preferred_language)
+    lcset_out = Charset(lcset).output_charset or lcset
     # Now walk over all subparts of this message and scrub out various types
     for part in msg.walk():
         ctype = part.get_type(part.get_default_type())
@@ -180,11 +182,29 @@ def process(mlist, msg, msgdata=None):
             # message.
             if charset is None:
                 charset = part.get_content_charset(lcset)
+            # TK: if part is attached then check charset and scrub if none
+            if part.get('content-disposition') and \
+               not part.get_content_charset():
+                omask = os.umask(002)
+                try:
+                    url = save_attachment(mlist, part, dir)
+                finally:
+                    os.umask(omask)
+                filename = part.get_filename(_('not available'))
+                filename = Utils.oneline(filename, lcset)
+                del part['content-type']
+                del part['content-transfer-encoding']
+                part.set_payload(_("""\
+An embedded and charset-unspecified text was scrubbed...
+Name: %(filename)s
+Url: %(url)s
+"""), lcset)
         elif ctype == 'text/html' and isinstance(sanitize, IntType):
             if sanitize == 0:
                 if outer:
                     raise DiscardMessage
                 del part['content-type']
+                del part['content-transfer-encoding']
                 part.set_payload(_('HTML attachment scrubbed and removed'),
                                  # Adding charset arg and removing content-tpe
                                  # sets content-type to text/plain
@@ -202,6 +222,7 @@ def process(mlist, msg, msgdata=None):
                 finally:
                     os.umask(omask)
                 del part['content-type']
+                del part['content-transfer-encoding']
                 part.set_payload(_("""\
 An HTML attachment was scrubbed...
 URL: %(url)s
@@ -267,6 +288,7 @@ Url: %(url)s
                 os.umask(omask)
             desc = part.get('content-description', _('not available'))
             filename = part.get_filename(_('not available'))
+            filename = Utils.oneline(filename, lcset)
             del part['content-type']
             del part['content-transfer-encoding']
             part.set_payload(_("""\
@@ -285,8 +307,8 @@ Url : %(url)s
         # By default we take the charset of the first text/plain part in the
         # message, but if there was none, we'll use the list's preferred
         # language's charset.
-        if charset is None or charset == 'us-ascii':
-            charset = lcset
+        if not charset or charset == 'us-ascii':
+            charset = lcset_out
         # We now want to concatenate all the parts which have been scrubbed to
         # text/plain, into a single text/plain payload.  We need to make sure
         # all the characters in the concatenated string are in the same
@@ -294,17 +316,27 @@ Url : %(url)s
         # BAW: Martin's original patch suggested we might want to try
         # generalizing to utf-8, and that's probably a good idea (eventually).
         text = []
-        for part in msg.get_payload():
+        for part in msg.walk():
+            if part.get_content_maintype() == 'multipart':
+                continue
             # All parts should be scrubbed to text/plain by now.
             partctype = part.get_content_type()
             if partctype <> 'text/plain':
-                text.append(_('Skipped content of type %(partctype)s'))
+                text.append(_('Skipped content of type %(partctype)s\n'))
                 continue
             try:
                 t = part.get_payload(decode=True)
             except binascii.Error:
                 t = part.get_payload()
-            partcharset = part.get_content_charset()
+            # TK: get_content_charset() returns 'iso-2022-jp' for internally
+            # crafted (scrubbed) 'euc-jp' text part. So, first try 
+            # get_charset(), then get_content_charset() for the parts
+            # which are already embeded in the incoming message.
+            partcharset = part.get_charset()
+            if partcharset:
+                partcharset = str(partcharset)
+            else:
+                partcharset = part.get_content_charset()
             if partcharset and partcharset <> charset:
                 try:
                     t = unicode(t, partcharset, 'replace')
@@ -320,9 +352,10 @@ Url : %(url)s
                 except (UnicodeError, LookupError, ValueError):
                     t = t.encode(lcset, 'replace')
             # Separation is useful
-            if not t.endswith('\n'):
-                t += '\n'
-            text.append(t)
+            if isinstance(t, StringType):
+                if not t.endswith('\n'):
+                    t += '\n'
+                text.append(t)
         # Now join the text and set the payload
         sep = _('-------------- next part --------------\n')
         del msg['content-type']
@@ -376,7 +409,7 @@ def save_attachment(mlist, msg, dir, filter_html=True):
         # Now base the filename on what's in the attachment, uniquifying it if
         # necessary.
         filename = msg.get_filename()
-        if not filename:
+        if not filename or mm_cfg.SCRUBBER_DONT_USE_ATTACHMENT_FILENAME:
             filebase = 'attachment'
         else:
             # Sanitize the filename given in the message headers
author	tkikuchi <>	2004-09-17 03:00:43 +0000
committer	tkikuchi <>	2004-09-17 03:00:43 +0000
commit	035a0dfe1f1b2afd5d4fcb122a8d7026f506acd4 (patch)
tree	3181ee126ea817f29de97239ad700302a67d7d77
parent	c8dd8a96e4e5fdd6a1456c171ecf5cfb09d54a73 (diff)
download	mailman2-035a0dfe1f1b2afd5d4fcb122a8d7026f506acd4.tar.gz mailman2-035a0dfe1f1b2afd5d4fcb122a8d7026f506acd4.tar.xz mailman2-035a0dfe1f1b2afd5d4fcb122a8d7026f506acd4.zip