17 files changed, 233 insertions, 32 deletions
diff --git a/Mailman/Cgi/admin.py b/Mailman/Cgi/admin.py
index a939c88a..9ae661a8 100644
--- a/Mailman/Cgi/admin.py
+++ b/Mailman/Cgi/admin.py
@@ -1,4 +1,4 @@
-# Copyright (C) 1998-2015 by the Free Software Foundation, Inc.
+# Copyright (C) 1998-2016 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -86,6 +86,18 @@ def main():
     i18n.set_language(mlist.preferred_language)
     # If the user is not authenticated, we're done.
     cgidata = cgi.FieldStorage(keep_blank_values=1)
+    try:
+        cgidata.getvalue('csrf_token', '')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc = Document()
+        doc.set_language(mm_cfg.DEFAULT_SERVER_LANGUAGE)
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
 
     # CSRF check
     safe_params = ['VARHELP', 'adminpw', 'admlogin',
diff --git a/Mailman/Cgi/admindb.py b/Mailman/Cgi/admindb.py
index fb2c7e18..1e9fad0f 100644
--- a/Mailman/Cgi/admindb.py
+++ b/Mailman/Cgi/admindb.py
@@ -122,6 +122,18 @@ def main():
 
     # Make sure the user is authorized to see this page.
     cgidata = cgi.FieldStorage(keep_blank_values=1)
+    try:
+        cgidata.getvalue('adminpw', '')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc = Document()
+        doc.set_language(mm_cfg.DEFAULT_SERVER_LANGUAGE)
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
 
     if not mlist.WebAuthenticate((mm_cfg.AuthListAdmin,
                                   mm_cfg.AuthListModerator,
diff --git a/Mailman/Cgi/confirm.py b/Mailman/Cgi/confirm.py
index 97297e10..fec69dd2 100644
--- a/Mailman/Cgi/confirm.py
+++ b/Mailman/Cgi/confirm.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001-2015 by the Free Software Foundation, Inc.
+# Copyright (C) 2001-2016 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -73,7 +73,17 @@ def main():
 
     # Get the form data to see if this is a second-step confirmation
     cgidata = cgi.FieldStorage(keep_blank_values=1)
-    cookie = cgidata.getvalue('cookie')
+    try:
+        cookie = cgidata.getvalue('cookie')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
+
     if cookie == '':
         ask_for_cookie(mlist, doc, _('Confirmation string was empty.'))
         return
diff --git a/Mailman/Cgi/create.py b/Mailman/Cgi/create.py
index dd862208..3c2a7dc4 100644
--- a/Mailman/Cgi/create.py
+++ b/Mailman/Cgi/create.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001-2010 by the Free Software Foundation, Inc.
+# Copyright (C) 2001-2016 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -43,6 +43,17 @@ def main():
     doc.set_language(mm_cfg.DEFAULT_SERVER_LANGUAGE)
 
     cgidata = cgi.FieldStorage()
+    try:
+        cgidata.getvalue('doit', '')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
+
     parts = Utils.GetPathPieces()
     if parts:
         # Bad URL specification
diff --git a/Mailman/Cgi/edithtml.py b/Mailman/Cgi/edithtml.py
index 85632531..6eb65d6a 100644
--- a/Mailman/Cgi/edithtml.py
+++ b/Mailman/Cgi/edithtml.py
@@ -93,6 +93,16 @@ def main():
 
     # Must be authenticated to get any farther
     cgidata = cgi.FieldStorage()
+    try:
+        cgidata.getvalue('adminpw', '')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
 
     # Editing the html for a list is limited to the list admin and site admin.
     if not mlist.WebAuthenticate((mm_cfg.AuthListAdmin,
diff --git a/Mailman/Cgi/listinfo.py b/Mailman/Cgi/listinfo.py
index b07e2201..340f0fc1 100644
--- a/Mailman/Cgi/listinfo.py
+++ b/Mailman/Cgi/listinfo.py
@@ -58,7 +58,19 @@ def main():
 
     # See if the user want to see this page in other language
     cgidata = cgi.FieldStorage()
-    language = cgidata.getvalue('language')
+    try:
+        language = cgidata.getvalue('language')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc = Document()
+        doc.set_language(mm_cfg.DEFAULT_SERVER_LANGUAGE)
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
+
     if not Utils.IsLanguage(language):
         language = mlist.preferred_language
     i18n.set_language(language)
diff --git a/Mailman/Cgi/options.py b/Mailman/Cgi/options.py
index cdc2bef3..38b34fd1 100644
--- a/Mailman/Cgi/options.py
+++ b/Mailman/Cgi/options.py
@@ -1,4 +1,4 @@
-# Copyright (C) 1998-2015 by the Free Software Foundation, Inc.
+# Copyright (C) 1998-2016 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -108,7 +108,17 @@ def main():
     # we might have a 'language' key in the cgi data.  That was an explicit
     # preference to view the page in, so we should honor that here.  If that's
     # not available, use the list's default language.
-    language = cgidata.getvalue('language')
+    try:
+        language = cgidata.getvalue('language')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
+
     if not Utils.IsLanguage(language):
         language = mlist.preferred_language
     i18n.set_language(language)
diff --git a/Mailman/Cgi/private.py b/Mailman/Cgi/private.py
index 36cacee4..0f7597a2 100755
--- a/Mailman/Cgi/private.py
+++ b/Mailman/Cgi/private.py
@@ -1,4 +1,4 @@
-# Copyright (C) 1998-2014 by the Free Software Foundation, Inc.
+# Copyright (C) 1998-2016 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -118,7 +118,16 @@ def main():
     doc.set_language(mlist.preferred_language)
 
     cgidata = cgi.FieldStorage()
-    username = cgidata.getvalue('username', '')
+    try:
+        username = cgidata.getvalue('username', '')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
     password = cgidata.getvalue('password', '')
 
     is_auth = 0
diff --git a/Mailman/Cgi/rmlist.py b/Mailman/Cgi/rmlist.py
index db588121..3149700d 100644
--- a/Mailman/Cgi/rmlist.py
+++ b/Mailman/Cgi/rmlist.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2001-2014 by the Free Software Foundation, Inc.
+# Copyright (C) 2001-2016 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -41,6 +41,17 @@ def main():
     doc.set_language(mm_cfg.DEFAULT_SERVER_LANGUAGE)
 
     cgidata = cgi.FieldStorage()
+    try:
+        cgidata.getvalue('password', '')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
+
     parts = Utils.GetPathPieces()
 
     if not parts:
diff --git a/Mailman/Cgi/roster.py b/Mailman/Cgi/roster.py
index 6c64925b..e9ab03c1 100644
--- a/Mailman/Cgi/roster.py
+++ b/Mailman/Cgi/roster.py
@@ -1,4 +1,4 @@
-# Copyright (C) 1998-2014 by the Free Software Foundation, Inc.
+# Copyright (C) 1998-2016 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -63,7 +63,19 @@ def main():
     cgidata = cgi.FieldStorage()
 
     # messages in form should go in selected language (if any...)
-    lang = cgidata.getvalue('language')
+    try:
+        lang = cgidata.getvalue('language')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc = Document()
+        doc.set_language(mm_cfg.DEFAULT_SERVER_LANGUAGE)
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
+
     if not Utils.IsLanguage(lang):
         lang = mlist.preferred_language
     i18n.set_language(lang)
diff --git a/Mailman/Cgi/subscribe.py b/Mailman/Cgi/subscribe.py
index ab5c7cd8..36d25fa2 100755
--- a/Mailman/Cgi/subscribe.py
+++ b/Mailman/Cgi/subscribe.py
@@ -1,4 +1,4 @@
-# Copyright (C) 1998-2015 by the Free Software Foundation, Inc.
+# Copyright (C) 1998-2016 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -70,7 +70,16 @@ def main():
     # See if the form data has a preferred language set, in which case, use it
     # for the results.  If not, use the list's preferred language.
     cgidata = cgi.FieldStorage()
-    language = cgidata.getvalue('language')
+    try:
+        language = cgidata.getvalue('language', '')
+    except TypeError:
+        # Someone crafted a POST with a bad Content-Type:.
+        doc.AddItem(Header(2, _("Error")))
+        doc.AddItem(Bold(_('Invalid options to CGI script.')))
+        # Send this with a 400 status.
+        print 'Status: 400 Bad Request'
+        print doc.Format()
+        return
     if not Utils.IsLanguage(language):
         language = mlist.preferred_language
     i18n.set_language(language)
diff --git a/Mailman/Defaults.py.in b/Mailman/Defaults.py.in
index 4ae5633f..9ecdbe62 100755
--- a/Mailman/Defaults.py.in
+++ b/Mailman/Defaults.py.in
@@ -218,6 +218,13 @@ add_virtualhost(DEFAULT_URL_HOST, DEFAULT_EMAIL_HOST)
 # -owners address, unless the message is explicitly approved.
 KNOWN_SPAMMERS = []
 
+# The header_filter_rules in Privacy options... -> Spam filters are matched as
+# normalized unicodes against normalized unicode headers.  This setting
+# determines the normalization form.  It is one of 'NFC', 'NFD', 'NFKC' or
+# 'NFKD'.  See
+# https://docs.python.org/2/library/unicodedata.html#unicodedata.normalize
+NORMALIZE_FORM = 'NFKC'
+
 
 
 #####
diff --git a/Mailman/Gui/Privacy.py b/Mailman/Gui/Privacy.py
index e500908d..2a9cca26 100644
--- a/Mailman/Gui/Privacy.py
+++ b/Mailman/Gui/Privacy.py
@@ -17,6 +17,7 @@
 
 """MailList mixin class managing the privacy options."""
 
+import os
 import re
 
 from Mailman import mm_cfg
@@ -658,9 +659,20 @@ class Privacy(GUIBase):
                 doc.addError(_("""Header filter rules require a pattern.
                 Incomplete filter rules will be ignored."""))
                 continue
-            # Make sure the pattern was a legal regular expression
+            # Make sure the pattern was a legal regular expression.
+            # Convert it to unicode if necessary.
+            mo = re.match('.*charset=([-_a-z0-9]+)',
+                          os.environ.get('CONTENT_TYPE', ''),
+                          re.IGNORECASE
+                         )
+            if mo:
+                cset = mo.group(1)
+            else:
+                cset = Utils.GetCharSet(mlist.preferred_language)
             try:
-                re.compile(pattern)
+                upattern = Utils.xml_to_unicode(pattern, cset)
+                re.compile(upattern)
+                pattern = upattern
             except (re.error, TypeError):
                 safepattern = Utils.websafe(pattern)
                 doc.addError(_("""The header filter rule pattern
diff --git a/Mailman/Handlers/SpamDetect.py b/Mailman/Handlers/SpamDetect.py
index de19adfc..aaddff5f 100644
--- a/Mailman/Handlers/SpamDetect.py
+++ b/Mailman/Handlers/SpamDetect.py
@@ -27,6 +27,7 @@ TBD: This needs to be made more configurable and robust.
 
 import re
 
+from unicodedata import normalize
 from email.Errors import HeaderParseError
 from email.Header import decode_header
 from email.Utils import parseaddr
@@ -36,6 +37,7 @@ from Mailman import Errors
 from Mailman import i18n
 from Mailman import Utils
 from Mailman.Handlers.Hold import hold_for_approval
+from Mailman.Logging.Syslog import syslog
 
 try:
     True, False
@@ -63,11 +65,11 @@ _ = i18n._
 
 
 def getDecodedHeaders(msg, cset='utf-8'):
-    """Returns a string containing all the headers of msg, unfolded and
-    RFC 2047 decoded and encoded in cset.
+    """Returns a unicode containing all the headers of msg, unfolded and
+    RFC 2047 decoded, normalized and separated by new lines.
     """
 
-    headers = ''
+    headers = u''
     for h, v in msg.items():
         uvalue = u''
         try:
@@ -86,7 +88,8 @@ def getDecodedHeaders(msg, cset='utf-8'):
                 # unicode it as iso-8859-1 which may result in a garbled
                 # mess, but we have to do something.
                 uvalue += unicode(frag, 'iso-8859-1', 'replace')
-        headers += '%s: %s\n' % (h, uvalue.encode(cset, 'xmlcharrefreplace'))
+        uvalue = normalize(mm_cfg.NORMALIZE_FORM, uvalue)
+        headers += u'%s: %s\n' % (h.decode('us-ascii', 'replace'), uvalue)
     return headers
 
 
@@ -150,7 +153,7 @@ error, contact the mailing list owner at %(listowner)s."""))
     # Now do header_filter_rules
     # TK: Collect headers in sub-parts because attachment filename
     # extension may be a clue to possible virus/spam.
-    headers = ''
+    headers = u''
     # Get the character set of the lists preferred language for headers
     lcset = Utils.GetCharSet(mlist.preferred_language)
     for p in msg.walk():
@@ -164,7 +167,17 @@ error, contact the mailing list owner at %(listowner)s."""))
             # ignore 'empty' patterns
             if not pattern.strip():
                 continue
-            if re.search(pattern, headers, re.IGNORECASE|re.MULTILINE):
+            pattern = Utils.xml_to_unicode(pattern, lcset)
+            pattern = normalize(mm_cfg.NORMALIZE_FORM, pattern)
+            try:
+                mo = re.search(pattern, headers, re.I | re.M | re.U)
+            except (re.error, TypeError):
+                syslog('error',
+                       'ignoring header_filter_rules invalid pattern: %s',
+                       pattern)
+                # Skip this rule: mo is unbound (or stale) after a failure.
+                continue
+            if mo:
                 if action == mm_cfg.DISCARD:
                     raise Errors.DiscardMessage
                 if action == mm_cfg.REJECT:
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index 2dbaef0b..d2317b10 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -1432,3 +1432,34 @@ def check_eq_domains(email, domains_list):
             return [local + '@' + x for x in domains if x != domain]
     return []
 
+
+def _invert_xml(mo):
+    # re.sub callback for xml_to_unicode: turn one matched XML char ref
+    # ('#nnnn') or textual \u escape ('uxxxx') into its unicode character.
+    try:
+        if mo.group(1)[:1] == '#':
+            return unichr(int(mo.group(1)[1:]))
+        elif mo.group(1)[:1].lower() == 'u':
+            return unichr(int(mo.group(1)[1:], 16))
+        else:
+            return(u'\ufffd')
+    except (ValueError, OverflowError):
+        # Out of range (OverflowError if it exceeds a C int).  Use U+FFFD.
+        return(u'\ufffd')
+
+
+def xml_to_unicode(s, cset):
+    """Convert s, a str encoded in cset, to unicode, translating &#nnnn; XML
+    character references and textual \uxxxx escapes.  More or less the inverse
+    of unicode.encode(cset, errors='xmlcharrefreplace'); like canonstr above,
+    but invalid refs become the unicode replace character and \u escapes are
+    recognized.  An s that is not a str is returned unchanged.
+    """
+    if isinstance(s, str):
+        us = s.decode(cset, 'replace')
+        us = re.sub(u'&(#[0-9]+);', _invert_xml, us)
+        us = re.sub(u'(?i)\\\\(u[a-f0-9]{4})', _invert_xml, us)
+        return us
+    else:
+        return s
+
diff --git a/Mailman/htmlformat.py b/Mailman/htmlformat.py
index 419fa296..2770eb60 100755
--- a/Mailman/htmlformat.py
+++ b/Mailman/htmlformat.py
@@ -1,4 +1,4 @@
-# Copyright (C) 1998-2015 by the Free Software Foundation, Inc.
+# Copyright (C) 1998-2016 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -453,7 +453,7 @@ class InputObj:
         output.append('>')
         ret = SPACE.join(output)
         if self.type == 'TEXT' and isinstance(ret, unicode):
-            ret = ret.encode(charset, 'replace')
+            ret = ret.encode(charset, 'xmlcharrefreplace')
         return ret
 
 
@@ -504,7 +504,7 @@ class TextArea:
             output += ' READONLY'
         output += '>%s</TEXTAREA>' % self.text
         if isinstance(output, unicode):
-            output = output.encode(charset, 'replace')
+            output = output.encode(charset, 'xmlcharrefreplace')
         return output
 
 class FileUpload(InputObj):
diff --git a/NEWS b/NEWS
index 3ae3220f..2ca87cac 100644
--- a/NEWS
+++ b/NEWS
@@ -9,13 +9,30 @@ Here is a history of user visible changes to Mailman.
  
   New Features
 
-    - RFC 2047 encoded headers are now decoded and re-encoded in the charset of
-      the list's preferred language for matching by header_filter_rules using
-      errors='xmlcharrefreplace' instead of the former errors='replace'.  This
-      means that characters that can't be represented in the charset of the
-      list's preferred language will now be represented as '&#nnnn;' XML
-      character references rather than '?' enabling regexps to be constructed
-      to match specific characters or ranges.  (LP: #558155)
+    - For header_filter_rules matching, both RFC 2047 encoded headers and
+      header_filter_rules patterns are now decoded to unicode.  Both
+      XML character references of the form &#nnnn; and unicode escapes of the
+      form \Uxxxx in patterns are converted to unicodes as well.  Both headers
+      and patterns are normalized to 'NFKC' normal form before matching, but
+      the normalization form can be set via a new NORMALIZE_FORM mm_cfg
+      setting.  Also, the web UI has been updated to encode characters in text
+      fields that are invalid in the character set of the page's language as
+      XML character references instead of '?'.  This should help with entering
+      header_filter_rules patterns to match 'odd' characters.  This feature is
+      experimental and is problematic for some cases where it is desired to
+      have a header_filter_rules pattern with characters not in the character
+      set of the list's preferred language.  For patterns without such
+      characters, the only change in behavior should be because of unicode
+      normalization which should improve matching.  For other situations such
+      as trying to match a Subject: with CJK characters (range U+4E00..U+9FFF)
+      on an English language (ascii) list, one can enter a pattern like
+      '^subject:.*[&#19968;-&#40959;]' or '^subject:.*[\u4e00-\u9fff]' to
+      match a Subject with any character in the range, and it will work, but
+      depending on the actual characters and the browser, submitting another,
+      even unrelated change can garble the original entry although this
+      usually occurs only with ascii pages and characters in the range
+      \u0080-\u00ff.  The \Uxxxx unicode escapes must have exactly 4 hex
+      digits, but they are case insensitive.  (LP: #558155)
 
     - Thanks to Jim Popovitch REMOVE_DKIM_HEADERS can now be set to 3 to
       preserve the original headers as X-Mailman-Original-... before removing
@@ -48,6 +65,9 @@ Here is a history of user visible changes to Mailman.
 
   Bug fixes and other patches
 
+    - We no longer throw an uncaught TypeError with certain defective crafted
+      POST requests to Mailman's CGIs.  (LP: #1602608)
+
     - Scrubber links in archives are now in the list's preferred_language
       rather than the poster's language.  (LP: #1586505)