aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMark Sapiro <mark@msapiro.net>2016-07-14 19:10:24 -0700
committerMark Sapiro <mark@msapiro.net>2016-07-14 19:10:24 -0700
commitb17234a23a590d9b27f3f609781596eea27b6974 (patch)
tree6d065e88b6a68a6fbc989a4b8e425769da00d293
parent6efea059931995de8713f35bccc1116905175cf2 (diff)
downloadmailman2-b17234a23a590d9b27f3f609781596eea27b6974.tar.gz
mailman2-b17234a23a590d9b27f3f609781596eea27b6974.tar.xz
mailman2-b17234a23a590d9b27f3f609781596eea27b6974.zip
Match header_filter_rules as normalized unicodes.
-rwxr-xr-xMailman/Defaults.py.in7
-rw-r--r--Mailman/Gui/Privacy.py16
-rw-r--r--Mailman/Handlers/SpamDetect.py25
-rw-r--r--Mailman/Utils.py31
-rwxr-xr-xMailman/htmlformat.py6
-rw-r--r--NEWS31
6 files changed, 98 insertions, 18 deletions
diff --git a/Mailman/Defaults.py.in b/Mailman/Defaults.py.in
index 4ae5633f..9ecdbe62 100755
--- a/Mailman/Defaults.py.in
+++ b/Mailman/Defaults.py.in
@@ -218,6 +218,13 @@ add_virtualhost(DEFAULT_URL_HOST, DEFAULT_EMAIL_HOST)
# -owners address, unless the message is explicitly approved.
KNOWN_SPAMMERS = []
+# The header_filter_rules in Privacy options... -> Spam filters are matched as
+# normalized unicodes against normalized unicode headers. This setting
+# determines the normalization form. It is one of 'NFC', 'NFD', 'NFKC' or
+# 'NFKD'. See
+# https://docs.python.org/2/library/unicodedata.html#unicodedata.normalize
+NORMALIZE_FORM = 'NFKC'
+
#####
diff --git a/Mailman/Gui/Privacy.py b/Mailman/Gui/Privacy.py
index e500908d..2a9cca26 100644
--- a/Mailman/Gui/Privacy.py
+++ b/Mailman/Gui/Privacy.py
@@ -17,6 +17,7 @@
"""MailList mixin class managing the privacy options."""
+import os
import re
from Mailman import mm_cfg
@@ -658,9 +659,20 @@ class Privacy(GUIBase):
doc.addError(_("""Header filter rules require a pattern.
Incomplete filter rules will be ignored."""))
continue
- # Make sure the pattern was a legal regular expression
+ # Make sure the pattern was a legal regular expression.
+ # Convert it to unicode if necessary.
+ mo = re.match('.*charset=([-_a-z0-9]+)',
+ os.environ.get('CONTENT_TYPE', ''),
+ re.IGNORECASE
+ )
+ if mo:
+ cset = mo.group(1)
+ else:
+ cset = Utils.GetCharSet(mlist.preferred_language)
try:
- re.compile(pattern)
+ upattern = Utils.xml_to_unicode(pattern, cset)
+ re.compile(upattern)
+ pattern = upattern
except (re.error, TypeError):
safepattern = Utils.websafe(pattern)
doc.addError(_("""The header filter rule pattern
diff --git a/Mailman/Handlers/SpamDetect.py b/Mailman/Handlers/SpamDetect.py
index de19adfc..aaddff5f 100644
--- a/Mailman/Handlers/SpamDetect.py
+++ b/Mailman/Handlers/SpamDetect.py
@@ -27,6 +27,7 @@ TBD: This needs to be made more configurable and robust.
import re
+from unicodedata import normalize
from email.Errors import HeaderParseError
from email.Header import decode_header
from email.Utils import parseaddr
@@ -36,6 +37,7 @@ from Mailman import Errors
from Mailman import i18n
from Mailman import Utils
from Mailman.Handlers.Hold import hold_for_approval
+from Mailman.Logging.Syslog import syslog
try:
True, False
@@ -63,11 +65,11 @@ _ = i18n._
def getDecodedHeaders(msg, cset='utf-8'):
- """Returns a string containing all the headers of msg, unfolded and
- RFC 2047 decoded and encoded in cset.
+ """Returns a unicode containing all the headers of msg, unfolded and
+ RFC 2047 decoded, normalized and separated by new lines.
"""
- headers = ''
+ headers = u''
for h, v in msg.items():
uvalue = u''
try:
@@ -86,7 +88,8 @@ def getDecodedHeaders(msg, cset='utf-8'):
# unicode it as iso-8859-1 which may result in a garbled
# mess, but we have to do something.
uvalue += unicode(frag, 'iso-8859-1', 'replace')
- headers += '%s: %s\n' % (h, uvalue.encode(cset, 'xmlcharrefreplace'))
+ uhdr = h.decode('us-ascii', 'replace')
+ headers += u'%s: %s\n' % (h, normalize(mm_cfg.NORMALIZE_FORM, uvalue))
return headers
@@ -150,7 +153,7 @@ error, contact the mailing list owner at %(listowner)s."""))
# Now do header_filter_rules
# TK: Collect headers in sub-parts because attachment filename
# extension may be a clue to possible virus/spam.
- headers = ''
+ headers = u''
# Get the character set of the lists preferred language for headers
lcset = Utils.GetCharSet(mlist.preferred_language)
for p in msg.walk():
@@ -164,7 +167,17 @@ error, contact the mailing list owner at %(listowner)s."""))
# ignore 'empty' patterns
if not pattern.strip():
continue
- if re.search(pattern, headers, re.IGNORECASE|re.MULTILINE):
+ pattern = Utils.xml_to_unicode(pattern, lcset)
+ pattern = normalize(mm_cfg.NORMALIZE_FORM, pattern)
+ try:
+ mo = re.search(pattern,
+ headers,
+ re.IGNORECASE|re.MULTILINE|re.UNICODE)
+ except (re.error, TypeError):
+ syslog('error',
+ 'ignoring header_filter_rules invalid pattern: %s',
+ pattern)
+ if mo:
if action == mm_cfg.DISCARD:
raise Errors.DiscardMessage
if action == mm_cfg.REJECT:
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index 2dbaef0b..d2317b10 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -1432,3 +1432,34 @@ def check_eq_domains(email, domains_list):
return [local + '@' + x for x in domains if x != domain]
return []
+
+def _invert_xml(mo):
+ # This is used with re.sub below to convert XML char refs and textual \u
+ # escapes to unicodes.
+ try:
+ if mo.group(1)[:1] == '#':
+ return unichr(int(mo.group(1)[1:]))
+ elif mo.group(1)[:1].lower() == 'u':
+ return unichr(int(mo.group(1)[1:], 16))
+ else:
+ return(u'\ufffd')
+ except ValueError:
+ # Value is out of range. Return the unicode replace character.
+ return(u'\ufffd')
+
+
+def xml_to_unicode(s, cset):
+ """This converts a string s, encoded in cset to a unicode with translation
+ of XML character references and textual \uxxxx escapes. It is more or less
+ the inverse of unicode.encode(cset, errors='xmlcharrefreplace'). It is
+ similar to canonstr above except for replacing invalid refs with the
+ unicode replace character and recognizing \u escapes.
+ """
+ if isinstance(s, str):
+ us = s.decode(cset, 'replace')
+ us = re.sub(u'&(#[0-9]+);', _invert_xml, us)
+ us = re.sub(u'(?i)\\\\(u[a-f0-9]{4})', _invert_xml, us)
+ return us
+ else:
+ return s
+
diff --git a/Mailman/htmlformat.py b/Mailman/htmlformat.py
index 419fa296..2770eb60 100755
--- a/Mailman/htmlformat.py
+++ b/Mailman/htmlformat.py
@@ -1,4 +1,4 @@
-# Copyright (C) 1998-2015 by the Free Software Foundation, Inc.
+# Copyright (C) 1998-2016 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
@@ -453,7 +453,7 @@ class InputObj:
output.append('>')
ret = SPACE.join(output)
if self.type == 'TEXT' and isinstance(ret, unicode):
- ret = ret.encode(charset, 'replace')
+ ret = ret.encode(charset, 'xmlcharrefreplace')
return ret
@@ -504,7 +504,7 @@ class TextArea:
output += ' READONLY'
output += '>%s</TEXTAREA>' % self.text
if isinstance(output, unicode):
- output = output.encode(charset, 'replace')
+ output = output.encode(charset, 'xmlcharrefreplace')
return output
class FileUpload(InputObj):
diff --git a/NEWS b/NEWS
index 7f85fa34..2ca87cac 100644
--- a/NEWS
+++ b/NEWS
@@ -9,13 +9,30 @@ Here is a history of user visible changes to Mailman.
New Features
- - RFC 2047 encoded headers are now decoded and re-encoded in the charset of
- the list's preferred language for matching by header_filter_rules using
- errors='xmlcharrefreplace' instead of the former errors='replace'. This
- means that characters that can't be represented in the charset of the
- list's preferred language will now be represented as '&#nnnn;' XML
- character references rather than '?' enabling regexps to be constructed
- to match specific characters or ranges. (LP: #558155)
+ - For header_filter_rules matching, both RFC 2047 encoded headers and
+ header_filter_rules patterns are now decoded to unicode as are. Both
+ XML character references of the form &#nnnn; and unicode escapes of the
+ form \uxxxx in patterns are converted to unicodes as well. Both headers
+ and patterns are normalized to 'NFKC' normal form before matching, but
+ the normalization form can be set via a new NORMALIZE_FORM mm_cfg
+ setting. Also, the web UI has been updated to encode characters in text
+ fields that are invalid in the character set of the page's language as
+ XML character references instead of '?'. This should help with entering
+ header_filter_rules patterns to match 'odd' characters. This feature is
+ experimental and is problematic for some cases where it is desired to
+ have a header_filter_rules pattern with characters not in the character
+ set of the list's preferred language. For patterns without such
+ characters, the only change in behavior should be because of unicode
+ normalization which should improve matching. For other situations such
+ as trying to match a Subject: with CJK characters (range U+4E00..U+9FFF)
+ on an English language (ascii) list, one can enter a pattern like
+ '^subject:.*[&#19968;-&#40959;]' or '^subject:.*[\u4e00-\u9fff]' to
+ match a Subject with any character in the range, and it will work, but
+ depending on the actual characters and the browser, submitting another,
+ even unrelated change can garble the original entry although this
+ usually occurs only with ascii pages and characters in the range
+ \u0080-\u00ff. The \uxxxx unicode escapes must have exactly 4 hex
+ digits, but they are case insensitive. (LP: #558155)
- Thanks to Jim Popovitch REMOVE_DKIM_HEADERS can now be set to 3 to
preserve the original headers as X-Mailman-Original-... before removing