aboutsummaryrefslogtreecommitdiffstats
path: root/Mailman/Utils.py
diff options
context:
space:
mode:
authorMark Sapiro <mark@msapiro.net>2016-07-14 19:10:24 -0700
committerMark Sapiro <mark@msapiro.net>2016-07-14 19:10:24 -0700
commitb17234a23a590d9b27f3f609781596eea27b6974 (patch)
tree6d065e88b6a68a6fbc989a4b8e425769da00d293 /Mailman/Utils.py
parent6efea059931995de8713f35bccc1116905175cf2 (diff)
downloadmailman2-b17234a23a590d9b27f3f609781596eea27b6974.tar.gz
mailman2-b17234a23a590d9b27f3f609781596eea27b6974.tar.xz
mailman2-b17234a23a590d9b27f3f609781596eea27b6974.zip
Match header_filter_rules as normalized unicodes.
Diffstat (limited to '')
-rw-r--r--Mailman/Utils.py31
1 files changed, 31 insertions, 0 deletions
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index 2dbaef0b..d2317b10 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -1432,3 +1432,34 @@ def check_eq_domains(email, domains_list):
return [local + '@' + x for x in domains if x != domain]
return []
+
def _invert_xml(mo):
    """Return the unicode character named by a matched character reference.

    This is a replacement callback for re.sub.  Group 1 of the match object
    mo is expected to be either '#' followed by decimal digits (taken from
    an XML character reference such as &#233;) or 'u'/'U' followed by hex
    digits (taken from a textual \\u00e9 escape).

    Returns the corresponding one-character unicode.  If group 1 has any
    other form, or the number cannot be converted to a character, the
    unicode replacement character U+FFFD is returned instead.
    """
    ref = mo.group(1)
    try:
        if ref[:1] == '#':
            return unichr(int(ref[1:]))
        elif ref[:1].lower() == 'u':
            return unichr(int(ref[1:], 16))
        else:
            return u'\ufffd'
    except (ValueError, OverflowError):
        # int() raises ValueError for malformed digits and unichr() raises
        # ValueError for a code point outside the unicode range.  unichr()
        # raises OverflowError when the number does not even fit in a C
        # integer (e.g. &#99999999999999999999;).  In every case fall back
        # to the unicode replace character rather than letting the
        # exception escape from re.sub.
        return u'\ufffd'
+
+
def xml_to_unicode(s, cset):
    """Decode s from cset and expand textual character references.

    If s is a str, it is decoded from the character set cset with the
    'replace' error handler, and then XML decimal character references
    (&#nnn;) and textual \\uxxxx escapes are replaced, via _invert_xml, by
    the unicode characters they name.  It is more or less the inverse of
    unicode.encode(cset, errors='xmlcharrefreplace').  It is similar to
    canonstr except that invalid references become the unicode replace
    character and \\u escapes are recognized.

    Anything that is not a str is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    decoded = s.decode(cset, 'replace')
    # Apply the XML character reference pattern first, then the textual
    # \u escape pattern; both hand their match to _invert_xml.
    for pattern in (u'&(#[0-9]+);', u'(?i)\\\\(u[a-f0-9]{4})'):
        decoded = re.sub(pattern, _invert_xml, decoded)
    return decoded
+