about summary refs log tree commit diff stats
path: root/Mailman/Utils.py
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- Mailman/Utils.py 41
1 files changed, 21 insertions, 20 deletions
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index 847d1a82..b0eb2dd0 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -808,12 +808,25 @@ def canonstr(s, lang=None):
newparts = []
parts = re.split(r'&(?P<ref>[^;]+);', s)
def appchr(i):
- if i < 256:
- newparts.append(chr(i))
+ # do everything in unicode
+ newparts.append(unichr(i))
+ def tounicode(s):
+ # We want the default fallback to be iso-8859-1 even if the language
+ # is English (us-ascii). This seems like a practical compromise so
+ # that non-ASCII characters in names can be used in English lists w/o
+ # having to change the global charset for English from us-ascii (which
+ # I superstitiously think may have unintended consequences).
+ if isinstance(s, unicode):
+ return s
+ if lang is None:
+ charset = 'iso-8859-1'
else:
- newparts.append(unichr(i))
+ charset = GetCharSet(lang)
+ if charset == 'us-ascii':
+ charset = 'iso-8859-1'
+ return unicode(s, charset, 'replace')
while True:
- newparts.append(parts.pop(0))
+ newparts.append(tounicode(parts.pop(0)))
if not parts:
break
ref = parts.pop(0)
@@ -822,28 +835,16 @@ def canonstr(s, lang=None):
appchr(int(ref[1:]))
except ValueError:
# Non-convertable, stick with what we got
- newparts.append('&'+ref+';')
+ newparts.append(tounicode('&'+ref+';'))
else:
c = htmlentitydefs.entitydefs.get(ref, '?')
if c.startswith('#') and c.endswith(';'):
appchr(int(ref[1:-1]))
else:
- newparts.append(c)
+ newparts.append(tounicode(c))
newstr = EMPTYSTRING.join(newparts)
- if isinstance(newstr, UnicodeType):
- return newstr
- # We want the default fallback to be iso-8859-1 even if the language is
- # English (us-ascii). This seems like a practical compromise so that
- # non-ASCII characters in names can be used in English lists w/o having to
- # change the global charset for English from us-ascii (which I
- # superstitiously think may have unintended consequences).
- if lang is None:
- charset = 'iso-8859-1'
- else:
- charset = GetCharSet(lang)
- if charset == 'us-ascii':
- charset = 'iso-8859-1'
- return unicode(newstr, charset, 'replace')
+ # newstr is unicode
+ return newstr
# The opposite of canonstr() -- sorta. I.e. it attempts to encode s in the