From 76081b2566d1e183e2a325af8a5457553c687a12 Mon Sep 17 00:00:00 2001
From: tkikuchi <>
Date: Sat, 9 Oct 2004 12:54:00 +0000
Subject: [ 601117 ] add sequential number in subject prefix. Here is my major
 patch. It was postponed to 2.2 but since 2.2 is so late...

---
 Mailman/Gui/General.py          |   6 +-
 Mailman/Handlers/CookHeaders.py | 169 +++++++++++++++++++++++++++-------------
 2 files changed, 122 insertions(+), 53 deletions(-)

(limited to 'Mailman')

diff --git a/Mailman/Gui/General.py b/Mailman/Gui/General.py
index 4b587e05..833735cd 100644
--- a/Mailman/Gui/General.py
+++ b/Mailman/Gui/General.py
@@ -145,7 +145,11 @@ class General(GUIBase):
              posted to the list, to distinguish mailing list messages in in
              mailbox summaries.  Brevity is premium here, it's ok to shorten
              long mailing list names to something more concise, as long as it
-             still identifies the mailing list.""")),
+             still identifies the mailing list.
+             You can also add a sequencial number by %%d substitution
+             directive. eg.; [listname %%d] -> [listname 123]
+                            (listname %%05d) -> (listname 00123)
+             """)),
 
             ('anonymous_list', mm_cfg.Radio, (_('No'), _('Yes')), 0,
              _("""Hide the sender of a message, replacing it with the list
diff --git a/Mailman/Handlers/CookHeaders.py b/Mailman/Handlers/CookHeaders.py
index fa471166..c49c3175 100644
--- a/Mailman/Handlers/CookHeaders.py
+++ b/Mailman/Handlers/CookHeaders.py
@@ -1,4 +1,4 @@
-# Copyright (C) 1998-2003 by the Free Software Foundation, Inc.
+# Copyright (C) 1998-2004 by the Free Software Foundation, Inc.
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
@@ -22,8 +22,9 @@ import re
 from types import UnicodeType
 
 from email.Charset import Charset
-from email.Header import Header, decode_header
+from email.Header import Header, decode_header, make_header
 from email.Utils import parseaddr, formataddr, getaddresses
+from email.Errors import HeaderParseError
 
 from Mailman import mm_cfg
 from Mailman import Utils
@@ -39,21 +40,21 @@ MAXLINELEN = 78
 def _isunicode(s):
     return isinstance(s, UnicodeType)
 
+nonascii = re.compile('[^\s!-~]')
+
 def uheader(mlist, s, header_name=None, continuation_ws='\t', maxlinelen=None):
-    # Get the charset to encode the string in.  If this is us-ascii, we'll use
-    # iso-8859-1 instead, just to get a little extra coverage, and because the
-    # Header class tries us-ascii first anyway.
+    # Get the charset to encode the string in.  Then check whether the string
+    # contains any non-ascii character.  If it does and the charset is
+    # us-ascii, we use iso-8859-1 instead.  If the string is ascii only, we
+    # use 'us-ascii' even if another charset is specified.
     charset = Utils.GetCharSet(mlist.preferred_language)
-    if charset == 'us-ascii':
-        charset = 'iso-8859-1'
-    charset = Charset(charset)
-    # Convert the string to unicode so Header will do the 3-charset encoding.
-    # If s is a byte string and there are funky characters in it that don't
-    # match the charset, we might as well replace them now.
-    if not _isunicode(s):
-        codec = charset.input_codec or 'ascii'
-        s = unicode(s, codec, 'replace')
-    # We purposefully leave no space b/w prefix and subject!
+    if nonascii.search(s):
+        # use list charset but ...
+        if charset == 'us-ascii':
+            charset = 'iso-8859-1'
+    else:
+        # there is no nonascii so ...
+        charset = 'us-ascii'
     return Header(s, charset, maxlinelen, header_name, continuation_ws)
 
 
@@ -71,7 +72,12 @@ def process(mlist, msg, msgdata):
     # VirginRunner sets _fasttrack for internally crafted messages.
     fasttrack = msgdata.get('_fasttrack')
     if not msgdata.get('isdigest') and not fasttrack:
-        prefix_subject(mlist, msg, msgdata)
+        try:
+            prefix_subject(mlist, msg, msgdata)
+        except (UnicodeError, ValueError):
+            # TK: Sometimes the subject header contains 8bit characters
+            # that are not MIME encoded; in that case simply skip prefixing.
+            pass
     # Mark message so we know we've been here, but leave any existing
     # X-BeenThere's intact.
     msg['X-BeenThere'] = mlist.GetListEmail()
@@ -127,7 +133,7 @@ def process(mlist, msg, msgdata):
         # because some folks think that some MUAs make it easier to delete
         # addresses from the right than from the left.
         if mlist.reply_goes_to_list == 1:
-            i18ndesc = uheader(mlist, mlist.description)
+            i18ndesc = uheader(mlist, mlist.description, 'Reply-To')
             add((str(i18ndesc), mlist.GetListEmail()))
         del msg['reply-to']
         # Don't put Reply-To: back if there's nothing to add!
@@ -150,7 +156,7 @@ def process(mlist, msg, msgdata):
             d = {}
             for pair in getaddresses(msg.get_all('cc', [])):
                 add(pair)
-            i18ndesc = uheader(mlist, mlist.description)
+            i18ndesc = uheader(mlist, mlist.description, 'Cc')
             add((str(i18ndesc), mlist.GetListEmail()))
             del msg['Cc']
             msg['Cc'] = COMMASPACE.join([formataddr(pair) for pair in new])
@@ -165,15 +171,16 @@ def process(mlist, msg, msgdata):
         return
     # This will act like an email address for purposes of formataddr()
     listid = '%s.%s' % (mlist.internal_name(), mlist.host_name)
+    cset = Utils.GetCharSet(mlist.preferred_language)
     if mlist.description:
         # Don't wrap the header since here we just want to get it properly RFC
         # 2047 encoded.
-        h = uheader(mlist, mlist.description, 'List-Id', maxlinelen=10000)
-        desc = str(h)
+        i18ndesc = uheader(mlist, mlist.description, 'List-Id', maxlinelen=998)
+        listid_h = formataddr((str(i18ndesc), listid))
     else:
-        desc = ''
-    listid_h = formataddr((desc, listid))
-    # BAW: I think the message object should handle any necessary wrapping.
+        # without desc we still need to ensure the brackets (a MUST)
+        listid_h = '<%s>' % listid
+    # We always add a List-ID: header.
     del msg['list-id']
     msg['List-Id'] = listid_h
     # For internally crafted messages, we
@@ -218,7 +225,9 @@ def prefix_subject(mlist, msg, msgdata):
     # Add the subject prefix unless the message is a digest or is being fast
     # tracked (e.g. internally crafted, delivered to a single user such as the
     # list admin).
-    prefix = mlist.subject_prefix
+    prefix = mlist.subject_prefix.strip()
+    if not prefix: 
+        return
     subject = msg.get('subject', '')
     # Try to figure out what the continuation_ws is for the header
     if isinstance(subject, Header):
@@ -229,35 +238,91 @@ def prefix_subject(mlist, msg, msgdata):
     if len(lines) > 1 and lines[1] and lines[1][0] in ' \t':
         ws = lines[1][0]
     msgdata['origsubj'] = subject
-    if not subject:
+    # The subject may be multilingual, but we take the first charset
+    # as the major one and try to decode.  If it is decodable, the returned
+    # subject is on one line and cset is properly set.  If decoding fails,
+    # the subject is returned undecoded and cset is set to us-ascii.  See
+    # ch_oneline() (CookHeaders one line function) for details.
+    subject, cset = ch_oneline(subject)
+    # Note: searching prefix in subject is REMOVED. (seq version)
+    # If the subject_prefix contains '%d', it is replaced with the
+    # mailing list sequential number.  Also, if the prefix is enclosed in
+    # [], (), or {}, the prefix in a reply's subject is taken care of.
+    # The sequential number format allows patterns like '%05d'.
+    p = re.compile('%\d*d')
+    if p.search(prefix,1):
+        # The prefix has a number, so we should search for the prefix
+        # with any number in the subject.
+        prefix_pattern = p.sub(r'\s*\d+\s*', prefix)
+    else:
+        prefix_pattern = prefix
+    prefix_pattern = re.sub('([\[\(\{\)])', '\\\\\g<1>', prefix_pattern)
+    subject = re.sub(prefix_pattern, '', subject)
+    subject = re.compile('(RE:\s*)+', re.I).sub('Re: ', subject, 1)
+    # At this point, the subject may be empty if someone posted a mail
+    # with only "subject: [subject prefix]".
+    if subject.strip() == '':
         subject = _('(no subject)')
-    # The header may be multilingual; decode it from base64/quopri and search
-    # each chunk for the prefix.  BAW: Note that if the prefix contains spaces
-    # and each word of the prefix is encoded in a different chunk in the
-    # header, we won't find it.  I think in practice that's unlikely though.
-    headerbits = decode_header(subject)
-    if prefix and subject:
-        pattern = re.escape(prefix.strip())
-        for decodedsubj, charset in headerbits:
-            if re.search(pattern, decodedsubj, re.IGNORECASE):
-                # The subject's already got the prefix, so don't change it
-                return
-    del msg['subject']
+        cset = Utils.GetCharSet(mlist.preferred_language)
+    # and substitute %d in prefix with post_id
+    try:
+        prefix = prefix % mlist.post_id
+    except TypeError:
+        pass
+    # If charset is 'us-ascii', try to concatenate as a plain string because
+    # there is some weirdness in the Header module (TK)
+    if cset == 'us-ascii':
+        try:
+            h = prefix + ' ' + subject
+            if type(h) == UnicodeType:
+                h = h.encode('us-ascii')
+            else:
+                h = unicode(h, 'us-ascii').encode('us-ascii')
+            del msg['subject']
+            msg['Subject'] = h
+            return
+        except UnicodeError:
+            pass
     # Get the header as a Header instance, with proper unicode conversion
     h = uheader(mlist, prefix, 'Subject', continuation_ws=ws)
-    for s, c in headerbits:
-        # Once again, convert the string to unicode.
-        if c is None:
-            c = Charset('iso-8859-1')
-        if not isinstance(c, Charset):
-            c = Charset(c)
-        if not _isunicode(s):
-            codec = c.input_codec or 'ascii'
-            try:
-                s = unicode(s, codec, 'replace')
-            except LookupError:
-                # Unknown codec, is this default reasonable?
-                s = unicode(s, Utils.GetCharSet(mlist.preferred_language),
-                            'replace')
-        h.append(s, c)
+    # in seq version, subject header is already concatenated
+    if not _isunicode(subject):
+        try:
+            subject = unicode(subject, cset, 'replace')
+        except (LookupError, TypeError):
+            # unknown codec
+            cset = Utils.GetCharSet(mlist.preferred_language)
+            subject = unicode(subject, cset, 'replace')
+    subject = subject.encode(cset,'replace')
+    h.append(subject, cset)
+    del msg['subject']
     msg['Subject'] = h
+
+
+def ch_oneline(s):
+    # Decode the header string into one line and convert it to a single
+    # charset.  Copied and modified from ToDigest.py and Utils.py.
+    # Returns a (string, cset) tuple; cset is 'us-ascii' on failure.
+    try:
+        d = decode_header(s)
+        # At this point, we should rstrip() every string because some
+        # MUAs deliberately add trailing spaces when composing a reply
+        # message.
+        i = 0
+        for (s,c) in d:
+            s = s.rstrip()
+            d[i] = (s,c)
+            i += 1
+        cset = 'us-ascii'
+        for x in d:
+            # search for the first non-None charset
+            if x[1]:
+                cset = x[1]
+                break
+        h = make_header(d)
+        ustr = h.__unicode__()
+        oneline = u''.join(ustr.splitlines())
+        return oneline.encode(cset, 'replace'), cset
+    except (LookupError, UnicodeError, ValueError, HeaderParseError):
+        # possibly charset problem. return with undecoded string in one line.
+        return ''.join(s.splitlines()), 'us-ascii'
-- 
cgit v1.2.3