1 files changed, 400 insertions, 0 deletions
diff --git a/Mailman/Handlers/Scrubber.py b/Mailman/Handlers/Scrubber.py
new file mode 100644
index 00000000..5dabadf3
--- /dev/null
+++ b/Mailman/Handlers/Scrubber.py
@@ -0,0 +1,400 @@
+# Copyright (C) 2001,2002 by the Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software 
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+"""Cleanse a message for archiving.
+"""
+
+import os
+import re
+import sha
+import time
+import errno
+import binascii
+import tempfile
+import mimetypes
+from cStringIO import StringIO
+from types import IntType
+
+from email.Utils import parsedate
+from email.Parser import HeaderParser
+from email.Generator import Generator
+
+from Mailman import mm_cfg
+from Mailman import Utils
+from Mailman import LockFile
+from Mailman import Message
+from Mailman.Errors import DiscardMessage
+from Mailman.i18n import _
+from Mailman.Logging.Syslog import syslog
+
+# Path separator characters for common platforms
+pre = re.compile(r'[/\\:]')
+# All other characters to strip out of Content-Disposition: filenames
+# (essentially anything that isn't an alphanum, dot, dash, or underscore).
+sre = re.compile(r'[^-\w.]')
+# Regexp to strip out leading dots
+dre = re.compile(r'^\.*')
+
+BR = '<br>\n'
+SPACE = ' '
+
+
+
+# Generator subclass that can suppress the writing of headers.  The extra
+# ctor argument skipheaders controls this: when true (the default) no
+# headers are written at all, when false they are written as usual.  The
+# mangle_from_ and maxheaderlen arguments are accepted because sub-Generators
+# get created passing only those two to the ctor, but their values are
+# ignored: mangle_from_ is always passed to the base class as 0.
+#
+# This isn't perfect because we still get stuff like the multipart boundaries.
+class ScrubberGenerator(Generator):
+    def __init__(self, outfp, mangle_from_=1, maxheaderlen=78, skipheaders=1):
+        self.__skipheaders = skipheaders
+        Generator.__init__(self, outfp, mangle_from_=0)
+
+    def _write_headers(self, msg):
+        if self.__skipheaders:
+            return
+        Generator._write_headers(self, msg)
+
+
+def safe_strftime(fmt, floatsecs):     # returns None if formatting fails
+    try:    # despite its name, floatsecs is a time tuple (or None)
+        return time.strftime(fmt, floatsecs)
+    except (TypeError, ValueError, OverflowError):  # TypeError: not a tuple
+        return None
+
+
+def calculate_attachments_dir(mlist, msg, msgdata):
+    # Return the directory, relative to the list's archive directory, that
+    # attachments for this message go under.  To avoid inode limitations the
+    # scheme is attachments/YYYYMMDD/<msgid-hash>.  As a side effect, msg
+    # gets a Message-ID: header if it doesn't already have one.
+    fmt = '%Y%m%d'
+    datestr = msg.get('Date')
+    if datestr:
+        now = parsedate(datestr)
+    else:
+        now = time.gmtime(msgdata.get('received_time', time.time()))
+    # parsedate() returns None for an unparseable date and strftime() raises
+    # TypeError on anything but a time tuple, so only format real tuples.
+    datedir = now and safe_strftime(fmt, now)
+    if not datedir:
+        # NOTE(review): presumably a date string; confirm with whoever sets it
+        now = msgdata.get('X-List-Received-Date')
+        if now and isinstance(now, type('')):
+            now = parsedate(now)
+        datedir = now and safe_strftime(fmt, now)
+    if not datedir:
+        # What next?  Unixfrom, I guess (which may be missing altogether).
+        parts = (msg.get_unixfrom() or '').split()
+        try:
+            month = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6,
+                     'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12,
+                     }.get(parts[3], 0)
+            day = int(parts[4])
+            year = int(parts[6])
+        except (IndexError, ValueError):
+            # Best we can do I think
+            month = day = year = 0
+        datedir = '%04d%02d%02d' % (year, month, day)
+    assert datedir
+    # Hash the Message-ID: so all attachments of one message share a directory
+    # (filenames are uniquified there).  The name is the first and last 4 hex
+    # digits of the SHA1 digest; clashes don't really matter too much.
+    msgid = msg['message-id']
+    if msgid is None:
+        msgid = msg['Message-ID'] = Utils.unique_message_id(mlist)
+    digest = sha.new(msgid).hexdigest()
+    return os.path.join('attachments', datedir, digest[:4] + digest[-4:])
+
+
+
+def _save_umasked(mlist, part, dir, filter_html=1):
+    # Save `part' with save_attachment() while the umask is 002, restoring the
+    # previous umask even if saving fails.  Returns the attachment's url.
+    omask = os.umask(002)
+    try:
+        return save_attachment(mlist, part, dir, filter_html)
+    finally:
+        os.umask(omask)
+
+
+def process(mlist, msg, msgdata=None):
+    # Scrub msg in place for archiving and return it.  text/plain parts are
+    # left alone, text/html parts are handled per ARCHIVE_HTML_SANITIZER, and
+    # embedded messages and other non-multipart parts are saved to disk and
+    # replaced with a short text/plain note giving their url.  A multipart
+    # message is finally flattened into a single text/plain payload.  Raises
+    # DiscardMessage if the outermost part is HTML and the sanitizer is 0.
+    #
+    # NOTE(review): the _() templates below use %(name)s with no explicit
+    # mapping, so _() presumably interpolates this function's local variables;
+    # don't rename the locals they mention.
+    sanitize = mm_cfg.ARCHIVE_HTML_SANITIZER
+    outer = 1
+    if msgdata is None:
+        msgdata = {}
+    dir = calculate_attachments_dir(mlist, msg, msgdata)
+    charset = None
+    # Now walk over all subparts of this message and scrub out various types
+    for part in msg.walk():
+        ctype = part.get_type(part.get_default_type())
+        # If the part is text/plain, we leave it alone
+        if ctype == 'text/plain':
+            # The scrubbed message needs a charset; arbitrarily use that of
+            # the first text/plain part in the message.
+            if charset is None:
+                charset = part.get_content_charset(charset)
+        elif ctype == 'text/html' and isinstance(sanitize, IntType):
+            if sanitize == 0:
+                if outer:
+                    raise DiscardMessage
+                part.set_payload(_('HTML attachment scrubbed and removed'))
+                part.set_type('text/plain')
+            elif sanitize == 2:
+                # By leaving it alone, Pipermail will automatically escape it
+                pass
+            elif sanitize == 3:
+                # Pull it out as an attachment but leave it unescaped.  This is
+                # dangerous, but perhaps useful for heavily moderated lists.
+                url = _save_umasked(mlist, part, dir, filter_html=0)
+                part.set_payload(_("""\
+An HTML attachment was scrubbed...
+URL: %(url)s
+"""))
+                part.set_type('text/plain')
+            else:
+                # HTML-escape it and store it as an attachment, but make it
+                # look a /little/ bit prettier. :(
+                payload = Utils.websafe(part.get_payload(decode=1))
+                # Change every space into a non-breaking space and every tab
+                # into 8 of those, then use a mono-space font.  Still looks
+                # hideous to me, but then I'd just as soon discard them.
+                def doreplace(s):
+                    return s.replace(' ', '&nbsp;').replace('\t', '&nbsp;'*8)
+                lines = [doreplace(s) for s in payload.split('\n')]
+                payload = '<tt>\n' + BR.join(lines) + '\n</tt>\n'
+                part.set_payload(payload)
+                # The payload is now decoded, so the old encoding must go.
+                del part['content-transfer-encoding']
+                url = _save_umasked(mlist, part, dir, filter_html=0)
+                part.set_payload(_("""\
+An HTML attachment was scrubbed...
+URL: %(url)s
+"""))
+                part.set_type('text/plain')
+        elif ctype == 'message/rfc822':
+            # This part contains a submessage, so it too needs scrubbing
+            submsg = part.get_payload(0)
+            url = _save_umasked(mlist, part, dir)
+            subject = submsg.get('subject', _('no subject'))
+            date = submsg.get('date', _('no date'))
+            who = submsg.get('from', _('unknown sender'))
+            size = len(str(submsg))
+            part.set_payload(_("""\
+An embedded message was scrubbed...
+From: %(who)s
+Subject: %(subject)s
+Date: %(date)s
+Size: %(size)s
+Url: %(url)s
+"""))
+            part.set_type('text/plain')
+        # If the message isn't a multipart, then we'll strip it out as an
+        # attachment that would have to be separately downloaded.  Pipermail
+        # will transform the url into a hyperlink.
+        elif not part.is_multipart():
+            # Report the size of the decoded attachment, not of its base64 or
+            # quoted-printable encoded text.
+            size = len(part.get_payload(decode=1) or '')
+            url = _save_umasked(mlist, part, dir)
+            desc = part.get('content-description', _('not available'))
+            filename = part.get_filename(_('not available'))
+            part.set_payload(_("""\
+A non-text attachment was scrubbed...
+Name: %(filename)s
+Type: %(ctype)s
+Size: %(size)d bytes
+Desc: %(desc)s
+Url : %(url)s
+"""))
+            part.set_type('text/plain')
+        outer = 0
+    # We still have to flatten multipart messages to plain text because
+    # Pipermail can't handle messages with list payloads.  This is a kludge.
+    if msg.is_multipart():
+        # If there was no text/plain part to take the charset from, use the
+        # charset of the list's preferred language.
+        if charset is None:
+            charset = Utils.GetCharSet(mlist.preferred_language)
+        # Concatenate all the parts, which have been scrubbed to text/plain,
+        # into a single text/plain payload.  All the characters must end up
+        # in the same encoding, so we coerce with the 'replace' error handler.
+        # BAW: Martin's original patch suggested we might want to try
+        # generalizing to utf-8, and that's probably a good idea (eventually).
+        text = []
+        for part in msg.get_payload():
+            # All parts should be scrubbed to text/plain by now.
+            partctype = part.get_content_type()
+            if partctype <> 'text/plain':
+                text.append(_('Skipped content of type %(partctype)s'))
+                continue
+            try:
+                t = part.get_payload(decode=1)
+            except binascii.Error:
+                t = part.get_payload()
+            partcharset = part.get_charset()
+            if partcharset and partcharset <> charset:
+                try:
+                    # unicode() wants a codec name, not a Charset object
+                    u = unicode(t, str(partcharset), 'replace')
+                    # Should use HTML-Escape, or try generalizing to UTF-8
+                    t = u.encode(charset, 'replace')
+                except (UnicodeError, LookupError):
+                    # Replace funny characters.  'replace' is needed on the
+                    # encode too, or the U+FFFD markers make it raise again.
+                    t = unicode(t, 'ascii', 'replace').encode('ascii',
+                                                               'replace')
+            text.append(t)
+        # Now join the text and set the payload
+        sep = _('-------------- next part --------------\n')
+        msg.set_payload(sep.join(text), charset)
+        msg.set_type('text/plain')
+        del msg['content-transfer-encoding']
+        msg.add_header('Content-Transfer-Encoding', '8bit')
+    return msg
+
+
+
+def makedirs(dir):
+    # Ensure `dir' and any missing parents exist; an existing dir is fine.
+    try:
+        os.makedirs(dir, 02775)
+    except OSError, err:
+        if err.errno <> errno.EEXIST:
+            raise
+    # FreeBSD's mkdir() seems not to honor the mode arg, so chmod `dir' and
+    # every directory below it explicitly.
+    fixmode = lambda arg, dirname, names: os.chmod(dirname, 02775)
+    os.path.walk(dir, fixmode, None)
+
+
+
+def save_attachment(mlist, msg, dir, filter_html=1):
+    # Write the decoded payload of the part `msg' to a uniquely named file
+    # under <archive dir>/<dir> and return the url it is reachable at.  When
+    # filter_html is true and the part is text/html, the payload is first
+    # run through the ARCHIVE_HTML_SANITIZER command and stored as .txt.
+    fsdir = os.path.join(mlist.archive_dir(), dir)
+    makedirs(fsdir)
+    # Figure out the attachment type and get the decoded data.  Parts with no
+    # Content-Type: header only have a default type; plain get_type() is None.
+    decodedpayload = msg.get_payload(decode=1)
+    ctype = msg.get_type(msg.get_default_type())
+    dofilter = filter_html and ctype == 'text/html'
+    # BAW: mimetypes ought to handle non-standard, but commonly found types,
+    # e.g. image/jpg (should be image/jpeg).  For now those end up as .bin.
+    ext = mimetypes.guess_extension(ctype)
+    if not ext:
+        # Unknown types are treated as a shapeless application/octet-stream,
+        # except message/rfc822 which gets coerced to text below.
+        if ctype == 'message/rfc822':
+            ext = '.txt'
+        else:
+            ext = '.bin'
+    # Sanitized HTML is stored as plain text, so reserve a .txt name for it;
+    # the sanitizer's temporary input file keeps the guessed extension.
+    tmpext = ext
+    if dofilter:
+        ext = '.txt'
+    # We need a lock to calculate the next attachment number
+    lockfile = os.path.join(fsdir, 'attachments.lock')
+    lock = LockFile.LockFile(lockfile)
+    lock.lock()
+    try:
+        # Base the filename on what's in the attachment, uniquifying it below.
+        filename = msg.get_filename()
+        if not filename:
+            filebase = 'attachment'
+        else:
+            # Sanitize the header's filename: last path component only, no
+            # leading dots, then only alphanumerics, dash, underscore, dot.
+            parts = pre.split(filename)
+            filename = parts[-1]
+            filename = dre.sub('', filename)
+            filename = sre.sub('', filename)
+            # Always use the extension we guessed, so attachments can't lie
+            # about their type; the filename's own extension is dropped.
+            filebase, ignore = os.path.splitext(filename)
+            # Sanitizing may leave nothing at all (e.g. a name of only dots)
+            if not filebase:
+                filebase = 'attachment'
+        # Now find a unique name for this file on the file system.  If
+        # msgdir/filebase.ext is taken, add a counter: msgdir/filebase-cnt.ext
+        counter = 0
+        extra = ''
+        while 1:
+            path = os.path.join(fsdir, filebase + extra + ext)
+            # Testing for existence before creating is normally a bad idea,
+            # but os.open(..., O_CREAT | O_EXCL) isn't NFS-safe, and we hold
+            # an exclusive lock, so no other process is racing with us here.
+            if os.path.exists(path):
+                counter += 1
+                extra = '-%04d' % counter
+            else:
+                break
+        # Claim the name under the lock; contents are written after release.
+        open(path, 'w').close()
+    finally:
+        lock.unlock()
+    # `path' now names the empty file reserved for the attachment.  When we
+    # filter HTML, ARCHIVE_HTML_SANITIZER is a command template taking the
+    # name of a file holding the payload; its output is what gets stored.
+    if dofilter:
+        base, ignore = os.path.splitext(path)
+        tmppath = base + '-tmp' + tmpext
+        fp = open(tmppath, 'w')
+        try:
+            fp.write(decodedpayload)
+            fp.close()
+            cmd = mm_cfg.ARCHIVE_HTML_SANITIZER % {'filename' : tmppath}
+            progfp = os.popen(cmd, 'r')
+            decodedpayload = progfp.read()
+            status = progfp.close()
+            if status:
+                syslog('error',
+                       'HTML sanitizer exited with non-zero status: %s',
+                       status)
+        finally:
+            os.unlink(tmppath)
+    elif ctype == 'message/rfc822':
+        # NOTE(review): process() fetches the embedded message with
+        # get_payload(0); confirm str() of get_payload() is the message text.
+        submsg = msg.get_payload()
+        decodedpayload = Utils.websafe(str(submsg))
+    fp = open(path, 'w')
+    try:
+        fp.write(decodedpayload)
+    finally:
+        fp.close()
+    # Now calculate the url, making sure the base url ends with a slash.
+    baseurl = mlist.GetBaseArchiveURL()
+    if baseurl[-1] <> '/':
+        baseurl += '/'
+    return baseurl + '%s/%s%s%s' % (dir, filebase, extra, ext)