1 files changed, 117 insertions, 52 deletions
diff --git a/Mailman/Handlers/CookHeaders.py b/Mailman/Handlers/CookHeaders.py
index fa471166..c49c3175 100644
--- a/Mailman/Handlers/CookHeaders.py
+++ b/Mailman/Handlers/CookHeaders.py
@@ -1,4 +1,4 @@
-# Copyright (C) 1998-2003 by the Free Software Foundation, Inc.
+# Copyright (C) 1998-2004 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -22,8 +22,9 @@ import re
 from types import UnicodeType
 
 from email.Charset import Charset
-from email.Header import Header, decode_header
+from email.Header import Header, decode_header, make_header
 from email.Utils import parseaddr, formataddr, getaddresses
+from email.Errors import HeaderParseError
 
 from Mailman import mm_cfg
 from Mailman import Utils
@@ -39,21 +40,21 @@ MAXLINELEN = 78
 def _isunicode(s):
     return isinstance(s, UnicodeType)
 
+nonascii = re.compile('[^\s!-~]')
+
 def uheader(mlist, s, header_name=None, continuation_ws='\t', maxlinelen=None):
-    # Get the charset to encode the string in.  If this is us-ascii, we'll use
-    # iso-8859-1 instead, just to get a little extra coverage, and because the
-    # Header class tries us-ascii first anyway.
+    # Get the charset to encode the string in, then check whether the string
+    # contains any non-ascii character.  If it does and the charset is
+    # us-ascii, we use iso-8859-1 instead.  If the string is ascii only, we
+    # use 'us-ascii' even if another charset is specified.
     charset = Utils.GetCharSet(mlist.preferred_language)
-    if charset == 'us-ascii':
-        charset = 'iso-8859-1'
-    charset = Charset(charset)
-    # Convert the string to unicode so Header will do the 3-charset encoding.
-    # If s is a byte string and there are funky characters in it that don't
-    # match the charset, we might as well replace them now.
-    if not _isunicode(s):
-        codec = charset.input_codec or 'ascii'
-        s = unicode(s, codec, 'replace')
-    # We purposefully leave no space b/w prefix and subject!
+    if nonascii.search(s):
+        # use list charset but ...
+        if charset == 'us-ascii':
+            charset = 'iso-8859-1'
+    else:
+        # there is no nonascii so ...
+        charset = 'us-ascii'
     return Header(s, charset, maxlinelen, header_name, continuation_ws)
 
 
@@ -71,7 +72,12 @@ def process(mlist, msg, msgdata):
     # VirginRunner sets _fasttrack for internally crafted messages.
     fasttrack = msgdata.get('_fasttrack')
     if not msgdata.get('isdigest') and not fasttrack:
-        prefix_subject(mlist, msg, msgdata)
+        try:
+            prefix_subject(mlist, msg, msgdata)
+        except (UnicodeError, ValueError):
+            # TK: Sometimes the subject header contains 8-bit characters that
+            # are not MIME encoded; in that case simply skip prefixing.
+            pass
     # Mark message so we know we've been here, but leave any existing
     # X-BeenThere's intact.
     msg['X-BeenThere'] = mlist.GetListEmail()
@@ -127,7 +133,7 @@ def process(mlist, msg, msgdata):
         # because some folks think that some MUAs make it easier to delete
         # addresses from the right than from the left.
         if mlist.reply_goes_to_list == 1:
-            i18ndesc = uheader(mlist, mlist.description)
+            i18ndesc = uheader(mlist, mlist.description, 'Reply-To')
             add((str(i18ndesc), mlist.GetListEmail()))
         del msg['reply-to']
         # Don't put Reply-To: back if there's nothing to add!
@@ -150,7 +156,7 @@ def process(mlist, msg, msgdata):
             d = {}
             for pair in getaddresses(msg.get_all('cc', [])):
                 add(pair)
-            i18ndesc = uheader(mlist, mlist.description)
+            i18ndesc = uheader(mlist, mlist.description, 'Cc')
             add((str(i18ndesc), mlist.GetListEmail()))
             del msg['Cc']
             msg['Cc'] = COMMASPACE.join([formataddr(pair) for pair in new])
@@ -165,15 +171,16 @@ def process(mlist, msg, msgdata):
         return
     # This will act like an email address for purposes of formataddr()
     listid = '%s.%s' % (mlist.internal_name(), mlist.host_name)
+    cset = Utils.GetCharSet(mlist.preferred_language)
     if mlist.description:
         # Don't wrap the header since here we just want to get it properly RFC
         # 2047 encoded.
-        h = uheader(mlist, mlist.description, 'List-Id', maxlinelen=10000)
-        desc = str(h)
+        i18ndesc = uheader(mlist, mlist.description, 'List-Id', maxlinelen=998)
+        listid_h = formataddr((str(i18ndesc), listid))
     else:
-        desc = ''
-    listid_h = formataddr((desc, listid))
-    # BAW: I think the message object should handle any necessary wrapping.
+        # Without a description, still add the brackets (they are a MUST)
+        listid_h = '<%s>' % listid
+    # We always add a List-ID: header.
     del msg['list-id']
     msg['List-Id'] = listid_h
     # For internally crafted messages, we
@@ -218,7 +225,9 @@ def prefix_subject(mlist, msg, msgdata):
     # Add the subject prefix unless the message is a digest or is being fast
     # tracked (e.g. internally crafted, delivered to a single user such as the
     # list admin).
-    prefix = mlist.subject_prefix
+    prefix = mlist.subject_prefix.strip()
+    if not prefix: 
+        return
     subject = msg.get('subject', '')
     # Try to figure out what the continuation_ws is for the header
     if isinstance(subject, Header):
@@ -229,35 +238,91 @@ def prefix_subject(mlist, msg, msgdata):
     if len(lines) > 1 and lines[1] and lines[1][0] in ' \t':
         ws = lines[1][0]
     msgdata['origsubj'] = subject
-    if not subject:
+    # The subject may be multilingual, but we take the first charset as the
+    # major one and try to decode it.  If that succeeds, the returned
+    # subject is on one line and cset is set accordingly.  If it fails, the
+    # subject is left MIME-encoded and cset is set to us-ascii.  See
+    # ch_oneline() (the CookHeaders one-line function) for details.
+    subject, cset = ch_oneline(subject)
+    # Note: searching for the prefix in the subject is REMOVED (seq version).
+    # If the subject_prefix contains '%d', it is replaced with the mailing
+    # list's sequential number.  Also, if the prefix is enclosed in [], (),
+    # or {}, the prefix in a reply's subject is taken care of as well.  The
+    # sequential number format also allows patterns like '%05d'.
+    p = re.compile('%\d*d')
+    if p.search(prefix,1):
+        # The prefix has a number, so we should search for the prefix
+        # with a number in the subject.
+        prefix_pattern = p.sub(r'\s*\d+\s*', prefix)
+    else:
+        prefix_pattern = prefix
+    prefix_pattern = re.sub('([\[\(\{\)])', '\\\\\g<1>', prefix_pattern)
+    subject = re.sub(prefix_pattern, '', subject)
+    subject = re.compile('(RE:\s*)+', re.I).sub('Re: ', subject, 1)
+    # At this point, the subject may be empty if someone posted mail whose
+    # subject was only the prefix, e.g. 'Subject: [subject prefix]'
+    if subject.strip() == '':
         subject = _('(no subject)')
-    # The header may be multilingual; decode it from base64/quopri and search
-    # each chunk for the prefix.  BAW: Note that if the prefix contains spaces
-    # and each word of the prefix is encoded in a different chunk in the
-    # header, we won't find it.  I think in practice that's unlikely though.
-    headerbits = decode_header(subject)
-    if prefix and subject:
-        pattern = re.escape(prefix.strip())
-        for decodedsubj, charset in headerbits:
-            if re.search(pattern, decodedsubj, re.IGNORECASE):
-                # The subject's already got the prefix, so don't change it
-                return
-    del msg['subject']
+        cset = Utils.GetCharSet(mlist.preferred_language)
+    # and substitute %d in prefix with post_id
+    try:
+        prefix = prefix % mlist.post_id
+    except TypeError:
+        pass
+    # If the charset is 'us-ascii', try to concatenate as a plain string
+    # because there is some weirdness in the Header module (TK)
+    if cset == 'us-ascii':
+        try:
+            h = prefix + ' ' + subject
+            if type(h) == UnicodeType:
+                h = h.encode('us-ascii')
+            else:
+                h = unicode(h, 'us-ascii').encode('us-ascii')
+            del msg['subject']
+            msg['Subject'] = h
+            return
+        except UnicodeError:
+            pass
     # Get the header as a Header instance, with proper unicode conversion
     h = uheader(mlist, prefix, 'Subject', continuation_ws=ws)
-    for s, c in headerbits:
-        # Once again, convert the string to unicode.
-        if c is None:
-            c = Charset('iso-8859-1')
-        if not isinstance(c, Charset):
-            c = Charset(c)
-        if not _isunicode(s):
-            codec = c.input_codec or 'ascii'
-            try:
-                s = unicode(s, codec, 'replace')
-            except LookupError:
-                # Unknown codec, is this default reasonable?
-                s = unicode(s, Utils.GetCharSet(mlist.preferred_language),
-                            'replace')
-        h.append(s, c)
+    # In the seq version, the subject header is already concatenated
+    if not _isunicode(subject):
+        try:
+            subject = unicode(subject, cset, 'replace')
+        except (LookupError, TypeError):
+            # unknown codec
+            cset = Utils.GetCharSet(mlist.preferred_language)
+            subject = unicode(subject, cset, 'replace')
+    subject = subject.encode(cset,'replace')
+    h.append(subject, cset)
+    del msg['subject']
     msg['Subject'] = h
+
+
+def ch_oneline(s):
+    # Decode the header string into one line, converted to a single charset.
+    # Copied and modified from ToDigest.py and Utils.py.
+    # Returns a (string, cset) tuple; on failure cset is 'us-ascii'.
+    try:
+        d = decode_header(s)
+        # At this point, we should rstrip() every string because some
+        # MUAs deliberately add trailing spaces when composing a reply
+        # message.
+        i = 0
+        for (s,c) in d:
+            s = s.rstrip()
+            d[i] = (s,c)
+            i += 1
+        cset = 'us-ascii'
+        for x in d:
+            # search for the first non-None charset
+            if x[1]:
+                cset = x[1]
+                break
+        h = make_header(d)
+        ustr = h.__unicode__()
+        oneline = u''.join(ustr.splitlines())
+        return oneline.encode(cset, 'replace'), cset
+    except (LookupError, UnicodeError, ValueError, HeaderParseError):
+        # Possibly a charset problem.  Return the undecoded string on one line.
+        return ''.join(s.splitlines()), 'us-ascii'