Match header_filter_rules patterns against message headers as normalized unicode strings.

author: Mark Sapiro <mark@msapiro.net> 2016-07-14 19:10:24 -0700
committer: Mark Sapiro <mark@msapiro.net> 2016-07-14 19:10:24 -0700
commit: b17234a23a590d9b27f3f609781596eea27b6974 (patch)
tree: 6d065e88b6a68a6fbc989a4b8e425769da00d293 /Mailman/Handlers
parent: 6efea059931995de8713f35bccc1116905175cf2 (diff)
download: mailman2-b17234a23a590d9b27f3f609781596eea27b6974.tar.gz
mailman2-b17234a23a590d9b27f3f609781596eea27b6974.tar.xz
mailman2-b17234a23a590d9b27f3f609781596eea27b6974.zip
1 files changed, 19 insertions, 6 deletions
diff --git a/Mailman/Handlers/SpamDetect.py b/Mailman/Handlers/SpamDetect.py
index de19adfc..aaddff5f 100644
--- a/Mailman/Handlers/SpamDetect.py
+++ b/Mailman/Handlers/SpamDetect.py
@@ -27,6 +27,7 @@ TBD: This needs to be made more configurable and robust.
 
 import re
 
+from unicodedata import normalize
 from email.Errors import HeaderParseError
 from email.Header import decode_header
 from email.Utils import parseaddr
@@ -36,6 +37,7 @@ from Mailman import Errors
 from Mailman import i18n
 from Mailman import Utils
 from Mailman.Handlers.Hold import hold_for_approval
+from Mailman.Logging.Syslog import syslog
 
 try:
     True, False
@@ -63,11 +65,11 @@ _ = i18n._
 
 
 def getDecodedHeaders(msg, cset='utf-8'):
-    """Returns a string containing all the headers of msg, unfolded and
-    RFC 2047 decoded and encoded in cset.
+    """Returns a unicode containing all the headers of msg, unfolded and
+    RFC 2047 decoded, normalized and separated by new lines.
     """
 
-    headers = ''
+    headers = u''
     for h, v in msg.items():
         uvalue = u''
         try:
@@ -86,7 +88,8 @@ def getDecodedHeaders(msg, cset='utf-8'):
                 # unicode it as iso-8859-1 which may result in a garbled
                 # mess, but we have to do something.
                 uvalue += unicode(frag, 'iso-8859-1', 'replace')
-        headers += '%s: %s\n' % (h, uvalue.encode(cset, 'xmlcharrefreplace'))
+        uhdr = h.decode('us-ascii', 'replace')
+        headers += u'%s: %s\n' % (h, normalize(mm_cfg.NORMALIZE_FORM, uvalue))
     return headers
 
 
@@ -150,7 +153,7 @@ error, contact the mailing list owner at %(listowner)s."""))
     # Now do header_filter_rules
     # TK: Collect headers in sub-parts because attachment filename
     # extension may be a clue to possible virus/spam.
-    headers = ''
+    headers = u''
     # Get the character set of the lists preferred language for headers
     lcset = Utils.GetCharSet(mlist.preferred_language)
     for p in msg.walk():
@@ -164,7 +167,17 @@ error, contact the mailing list owner at %(listowner)s."""))
             # ignore 'empty' patterns
             if not pattern.strip():
                 continue
-            if re.search(pattern, headers, re.IGNORECASE|re.MULTILINE):
+            pattern = Utils.xml_to_unicode(pattern, lcset)
+            pattern = normalize(mm_cfg.NORMALIZE_FORM, pattern)
+            try:
+                mo = re.search(pattern,
+                               headers,
+                               re.IGNORECASE|re.MULTILINE|re.UNICODE)
+            except (re.error, TypeError):
+                syslog('error',
+                       'ignoring header_filter_rules invalid pattern: %s',
+                       pattern)
+            if mo:
                 if action == mm_cfg.DISCARD:
                     raise Errors.DiscardMessage
                 if action == mm_cfg.REJECT:
author	Mark Sapiro <mark@msapiro.net>	2016-07-14 19:10:24 -0700
committer	Mark Sapiro <mark@msapiro.net>	2016-07-14 19:10:24 -0700
commit	b17234a23a590d9b27f3f609781596eea27b6974 (patch)
tree	6d065e88b6a68a6fbc989a4b8e425769da00d293 /Mailman/Handlers
parent	6efea059931995de8713f35bccc1116905175cf2 (diff)
download	mailman2-b17234a23a590d9b27f3f609781596eea27b6974.tar.gz mailman2-b17234a23a590d9b27f3f609781596eea27b6974.tar.xz mailman2-b17234a23a590d9b27f3f609781596eea27b6974.zip