aboutsummaryrefslogtreecommitdiffstats
path: root/Mailman/Utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'Mailman/Utils.py')
-rw-r--r--Mailman/Utils.py31
1 files changed, 31 insertions, 0 deletions
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index 2dbaef0b..d2317b10 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -1432,3 +1432,34 @@ def check_eq_domains(email, domains_list):
return [local + '@' + x for x in domains if x != domain]
return []
+
+def _invert_xml(mo):
+ # This is used with re.sub below to convert XML char refs and textual \u
+ # escapes to unicodes.
+ try:
+ if mo.group(1)[:1] == '#':
+ return unichr(int(mo.group(1)[1:]))
+ elif mo.group(1)[:1].lower() == 'u':
+ return unichr(int(mo.group(1)[1:], 16))
+ else:
+ return(u'\ufffd')
+ except ValueError:
+ # Value is out of range. Return the unicode replace character.
+ return(u'\ufffd')
+
+
+def xml_to_unicode(s, cset):
+ """This converts a string s, encoded in cset to a unicode with translation
+ of XML character references and textual \uxxxx escapes. It is more or less
+ the inverse of unicode.decode(cset, errors='xmlcharrefreplace'). It is
+ similar to canonstr above except for replacing invalid refs with the
+ unicode replace character and recognizing \u escapes.
+ """
+ if isinstance(s, str):
+ us = s.decode(cset, 'replace')
+ us = re.sub(u'&(#[0-9]+);', _invert_xml, us)
+ us = re.sub(u'(?i)\\\\(u[a-f0-9]{4})', _invert_xml, us)
+ return us
+ else:
+ return s
+