diff options
Diffstat (limited to 'Mailman/Handlers/CookHeaders.py')
-rw-r--r-- | Mailman/Handlers/CookHeaders.py | 169 |
1 files changed, 117 insertions, 52 deletions
diff --git a/Mailman/Handlers/CookHeaders.py b/Mailman/Handlers/CookHeaders.py index fa471166..c49c3175 100644 --- a/Mailman/Handlers/CookHeaders.py +++ b/Mailman/Handlers/CookHeaders.py @@ -1,4 +1,4 @@ -# Copyright (C) 1998-2003 by the Free Software Foundation, Inc. +# Copyright (C) 1998-2004 by the Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License @@ -22,8 +22,9 @@ import re from types import UnicodeType from email.Charset import Charset -from email.Header import Header, decode_header +from email.Header import Header, decode_header, make_header from email.Utils import parseaddr, formataddr, getaddresses +from email.Errors import HeaderParseError from Mailman import mm_cfg from Mailman import Utils @@ -39,21 +40,21 @@ MAXLINELEN = 78 def _isunicode(s): return isinstance(s, UnicodeType) +nonascii = re.compile('[^\s!-~]') + def uheader(mlist, s, header_name=None, continuation_ws='\t', maxlinelen=None): - # Get the charset to encode the string in. If this is us-ascii, we'll use - # iso-8859-1 instead, just to get a little extra coverage, and because the - # Header class tries us-ascii first anyway. + # Get the charset to encode the string in. Then search if there is any + # non-ascii character is in the string. If there is and the charset is + # us-ascii then we use iso-8859-1 instead. If the string is ascii only + # we use 'us-ascii' if another charset is specified. charset = Utils.GetCharSet(mlist.preferred_language) - if charset == 'us-ascii': - charset = 'iso-8859-1' - charset = Charset(charset) - # Convert the string to unicode so Header will do the 3-charset encoding. - # If s is a byte string and there are funky characters in it that don't - # match the charset, we might as well replace them now. - if not _isunicode(s): - codec = charset.input_codec or 'ascii' - s = unicode(s, codec, 'replace') - # We purposefully leave no space b/w prefix and subject! + if nonascii.search(s): + # use list charset but ... + if charset == 'us-ascii': + charset = 'iso-8859-1' + else: + # there is no nonascii so ... + charset = 'us-ascii' return Header(s, charset, maxlinelen, header_name, continuation_ws) @@ -71,7 +72,12 @@ def process(mlist, msg, msgdata): # VirginRunner sets _fasttrack for internally crafted messages. fasttrack = msgdata.get('_fasttrack') if not msgdata.get('isdigest') and not fasttrack: - prefix_subject(mlist, msg, msgdata) + try: + prefix_subject(mlist, msg, msgdata) + except (UnicodeError, ValueError): + # TK: Sometimes subject header is not MIME encoded for 8bit + # simply abort prefixing. + pass # Mark message so we know we've been here, but leave any existing # X-BeenThere's intact. msg['X-BeenThere'] = mlist.GetListEmail() @@ -127,7 +133,7 @@ def process(mlist, msg, msgdata): # because some folks think that some MUAs make it easier to delete # addresses from the right than from the left. if mlist.reply_goes_to_list == 1: - i18ndesc = uheader(mlist, mlist.description) + i18ndesc = uheader(mlist, mlist.description, 'Reply-To') add((str(i18ndesc), mlist.GetListEmail())) del msg['reply-to'] # Don't put Reply-To: back if there's nothing to add! @@ -150,7 +156,7 @@ def process(mlist, msg, msgdata): d = {} for pair in getaddresses(msg.get_all('cc', [])): add(pair) - i18ndesc = uheader(mlist, mlist.description) + i18ndesc = uheader(mlist, mlist.description, 'Cc') add((str(i18ndesc), mlist.GetListEmail())) del msg['Cc'] msg['Cc'] = COMMASPACE.join([formataddr(pair) for pair in new]) @@ -165,15 +171,16 @@ def process(mlist, msg, msgdata): return # This will act like an email address for purposes of formataddr() listid = '%s.%s' % (mlist.internal_name(), mlist.host_name) + cset = Utils.GetCharSet(mlist.preferred_language) if mlist.description: # Don't wrap the header since here we just want to get it properly RFC # 2047 encoded. - h = uheader(mlist, mlist.description, 'List-Id', maxlinelen=10000) - desc = str(h) + i18ndesc = uheader(mlist, mlist.description, 'List-Id', maxlinelen=998) + listid_h = formataddr((str(i18ndesc), listid)) else: - desc = '' - listid_h = formataddr((desc, listid)) - # BAW: I think the message object should handle any necessary wrapping. + # without desc we need to ensure the MUST brackets + listid_h = '<%s>' % listid + # We always add a List-ID: header. del msg['list-id'] msg['List-Id'] = listid_h # For internally crafted messages, we @@ -218,7 +225,9 @@ def prefix_subject(mlist, msg, msgdata): # Add the subject prefix unless the message is a digest or is being fast # tracked (e.g. internally crafted, delivered to a single user such as the # list admin). - prefix = mlist.subject_prefix + prefix = mlist.subject_prefix.strip() + if not prefix: + return subject = msg.get('subject', '') # Try to figure out what the continuation_ws is for the header if isinstance(subject, Header): @@ -229,35 +238,91 @@ def prefix_subject(mlist, msg, msgdata): if len(lines) > 1 and lines[1] and lines[1][0] in ' \t': ws = lines[1][0] msgdata['origsubj'] = subject - if not subject: + # The subject may be multilingual but we take the first charset + # as major one and try to decode. If it is decodable, returned + # subject is in one line and cset is properly set. If fail, + # subject is mime-encoded and cset is set as us-ascii. See detail + # for ch_oneline() (CookHeaders one line function). + subject, cset = ch_oneline(subject) + # Note: searching prefix in subject is REMOVED. (seq version) + # If the subject_prefix contains '%d', it is replaced with the + # mailing list sequential number. Also, if the prefix is closed with + # [],(), or {}, the prefix in the responding post subject will be cared. + # sequential number format allows '%05d' like pattern. + p = re.compile('%\d*d') + if p.search(prefix,1): + # prefix have number, so we should search prefix w/number + # in subject. + prefix_pattern = p.sub(r'\s*\d+\s*', prefix) + else: + prefix_pattern = prefix + prefix_pattern = re.sub('([\[\(\{\)])', '\\\\\g<1>', prefix_pattern) + subject = re.sub(prefix_pattern, '', subject) + subject = re.compile('(RE:\s*)+', re.I).sub('Re: ', subject, 1) + # At this point, subject may become null if someone post mail with + # subject: [subject prefix] + if subject.strip() == '': subject = _('(no subject)') - # The header may be multilingual; decode it from base64/quopri and search - # each chunk for the prefix. BAW: Note that if the prefix contains spaces - # and each word of the prefix is encoded in a different chunk in the - # header, we won't find it. I think in practice that's unlikely though. - headerbits = decode_header(subject) - if prefix and subject: - pattern = re.escape(prefix.strip()) - for decodedsubj, charset in headerbits: - if re.search(pattern, decodedsubj, re.IGNORECASE): - # The subject's already got the prefix, so don't change it - return - del msg['subject'] + cset = Utils.GetCharSet(mlist.preferred_language) + # and substitute %d in prefix with post_id + try: + prefix = prefix % mlist.post_id + except TypeError: + pass + # If charset is 'us-ascii', try to concatnate as string because there + # is some weirdness in Header module (TK) + if cset == 'us-ascii': + try: + h = prefix + ' ' + subject + if type(h) == UnicodeType: + h = h.encode('us-ascii') + else: + h = unicode(h, 'us-ascii').encode('us-ascii') + del msg['subject'] + msg['Subject'] = h + return + except UnicodeError: + pass # Get the header as a Header instance, with proper unicode conversion h = uheader(mlist, prefix, 'Subject', continuation_ws=ws) - for s, c in headerbits: - # Once again, convert the string to unicode. - if c is None: - c = Charset('iso-8859-1') - if not isinstance(c, Charset): - c = Charset(c) - if not _isunicode(s): - codec = c.input_codec or 'ascii' - try: - s = unicode(s, codec, 'replace') - except LookupError: - # Unknown codec, is this default reasonable? - s = unicode(s, Utils.GetCharSet(mlist.preferred_language), - 'replace') - h.append(s, c) + # in seq version, subject header is already concatnated + if not _isunicode(subject): + try: + subject = unicode(subject, cset, 'replace') + except (LookupError, TypeError): + # unknown codec + cset = Utils.GetCharSet(mlist.preferred_language) + subject = unicode(subject, cset, 'replace') + subject = subject.encode(cset,'replace') + h.append(subject, cset) + del msg['subject'] msg['Subject'] = h + + +def ch_oneline(s): + # Decode header string in one line and convert into single charset + # copied and modified from ToDigest.py and Utils.py + # return (string, cset) tuple as check for failure + try: + d = decode_header(s) + # at this point, we should rstrip() every string because some + # MUA deliberately add trailing spaces when composing return + # message. + i = 0 + for (s,c) in d: + s = s.rstrip() + d[i] = (s,c) + i += 1 + cset = 'us-ascii' + for x in d: + # search for no-None charset + if x[1]: + cset = x[1] + break + h = make_header(d) + ustr = h.__unicode__() + oneline = u''.join(ustr.splitlines()) + return oneline.encode(cset, 'replace'), cset + except (LookupError, UnicodeError, ValueError, HeaderParseError): + # possibly charset problem. return with undecoded string in one line. + return ''.join(s.splitlines()), 'us-ascii' |