diff options
Diffstat (limited to '')
-rw-r--r-- | Mailman/Utils.py | 41 |
1 files changed, 21 insertions, 20 deletions
diff --git a/Mailman/Utils.py b/Mailman/Utils.py index 847d1a82..b0eb2dd0 100644 --- a/Mailman/Utils.py +++ b/Mailman/Utils.py @@ -808,12 +808,25 @@ def canonstr(s, lang=None): newparts = [] parts = re.split(r'&(?P<ref>[^;]+);', s) def appchr(i): - if i < 256: - newparts.append(chr(i)) + # do everything in unicode + newparts.append(unichr(i)) + def tounicode(s): + # We want the default fallback to be iso-8859-1 even if the language + # is English (us-ascii). This seems like a practical compromise so + # that non-ASCII characters in names can be used in English lists w/o + # having to change the global charset for English from us-ascii (which + # I superstitiously think may have unintended consequences). + if isinstance(s, unicode): + return s + if lang is None: + charset = 'iso-8859-1' else: - newparts.append(unichr(i)) + charset = GetCharSet(lang) + if charset == 'us-ascii': + charset = 'iso-8859-1' + return unicode(s, charset, 'replace') while True: - newparts.append(parts.pop(0)) + newparts.append(tounicode(parts.pop(0))) if not parts: break ref = parts.pop(0) @@ -822,28 +835,16 @@ def canonstr(s, lang=None): appchr(int(ref[1:])) except ValueError: # Non-convertable, stick with what we got - newparts.append('&'+ref+';') + newparts.append(tounicode('&'+ref+';')) else: c = htmlentitydefs.entitydefs.get(ref, '?') if c.startswith('#') and c.endswith(';'): appchr(int(ref[1:-1])) else: - newparts.append(c) + newparts.append(tounicode(c)) newstr = EMPTYSTRING.join(newparts) - if isinstance(newstr, UnicodeType): - return newstr - # We want the default fallback to be iso-8859-1 even if the language is - # English (us-ascii). This seems like a practical compromise so that - # non-ASCII characters in names can be used in English lists w/o having to - # change the global charset for English from us-ascii (which I - # superstitiously think may have unintended consequences). - if lang is None: - charset = 'iso-8859-1' - else: - charset = GetCharSet(lang) - if charset == 'us-ascii': - charset = 'iso-8859-1' - return unicode(newstr, charset, 'replace') + # newstr is unicode + return newstr # The opposite of canonstr() -- sorta. I.e. it attempts to encode s in the |