aboutsummaryrefslogtreecommitdiffstats
path: root/Mailman/Handlers/CookHeaders.py
diff options
context:
space:
mode:
Diffstat (limited to 'Mailman/Handlers/CookHeaders.py')
-rw-r--r--Mailman/Handlers/CookHeaders.py169
1 files changed, 117 insertions, 52 deletions
diff --git a/Mailman/Handlers/CookHeaders.py b/Mailman/Handlers/CookHeaders.py
index fa471166..c49c3175 100644
--- a/Mailman/Handlers/CookHeaders.py
+++ b/Mailman/Handlers/CookHeaders.py
@@ -1,4 +1,4 @@
-# Copyright (C) 1998-2003 by the Free Software Foundation, Inc.
+# Copyright (C) 1998-2004 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
@@ -22,8 +22,9 @@ import re
from types import UnicodeType
from email.Charset import Charset
-from email.Header import Header, decode_header
+from email.Header import Header, decode_header, make_header
from email.Utils import parseaddr, formataddr, getaddresses
+from email.Errors import HeaderParseError
from Mailman import mm_cfg
from Mailman import Utils
@@ -39,21 +40,21 @@ MAXLINELEN = 78
def _isunicode(s):
return isinstance(s, UnicodeType)
+nonascii = re.compile('[^\s!-~]')
+
def uheader(mlist, s, header_name=None, continuation_ws='\t', maxlinelen=None):
- # Get the charset to encode the string in. If this is us-ascii, we'll use
- # iso-8859-1 instead, just to get a little extra coverage, and because the
- # Header class tries us-ascii first anyway.
+ # Get the charset to encode the string in. Then check whether there is
+ # any non-ascii character in the string. If there is and the charset is
+ # us-ascii then we use iso-8859-1 instead. If the string is ascii only
+ # we use 'us-ascii' even if another charset is specified.
charset = Utils.GetCharSet(mlist.preferred_language)
- if charset == 'us-ascii':
- charset = 'iso-8859-1'
- charset = Charset(charset)
- # Convert the string to unicode so Header will do the 3-charset encoding.
- # If s is a byte string and there are funky characters in it that don't
- # match the charset, we might as well replace them now.
- if not _isunicode(s):
- codec = charset.input_codec or 'ascii'
- s = unicode(s, codec, 'replace')
- # We purposefully leave no space b/w prefix and subject!
+ if nonascii.search(s):
+ # use list charset but ...
+ if charset == 'us-ascii':
+ charset = 'iso-8859-1'
+ else:
+ # there is no nonascii so ...
+ charset = 'us-ascii'
return Header(s, charset, maxlinelen, header_name, continuation_ws)
@@ -71,7 +72,12 @@ def process(mlist, msg, msgdata):
# VirginRunner sets _fasttrack for internally crafted messages.
fasttrack = msgdata.get('_fasttrack')
if not msgdata.get('isdigest') and not fasttrack:
- prefix_subject(mlist, msg, msgdata)
+ try:
+ prefix_subject(mlist, msg, msgdata)
+ except (UnicodeError, ValueError):
+ # TK: Sometimes the subject header has 8bit characters that are not
+ # MIME encoded; simply abort prefixing.
+ pass
# Mark message so we know we've been here, but leave any existing
# X-BeenThere's intact.
msg['X-BeenThere'] = mlist.GetListEmail()
@@ -127,7 +133,7 @@ def process(mlist, msg, msgdata):
# because some folks think that some MUAs make it easier to delete
# addresses from the right than from the left.
if mlist.reply_goes_to_list == 1:
- i18ndesc = uheader(mlist, mlist.description)
+ i18ndesc = uheader(mlist, mlist.description, 'Reply-To')
add((str(i18ndesc), mlist.GetListEmail()))
del msg['reply-to']
# Don't put Reply-To: back if there's nothing to add!
@@ -150,7 +156,7 @@ def process(mlist, msg, msgdata):
d = {}
for pair in getaddresses(msg.get_all('cc', [])):
add(pair)
- i18ndesc = uheader(mlist, mlist.description)
+ i18ndesc = uheader(mlist, mlist.description, 'Cc')
add((str(i18ndesc), mlist.GetListEmail()))
del msg['Cc']
msg['Cc'] = COMMASPACE.join([formataddr(pair) for pair in new])
@@ -165,15 +171,16 @@ def process(mlist, msg, msgdata):
return
# This will act like an email address for purposes of formataddr()
listid = '%s.%s' % (mlist.internal_name(), mlist.host_name)
+ cset = Utils.GetCharSet(mlist.preferred_language)
if mlist.description:
# Don't wrap the header since here we just want to get it properly RFC
# 2047 encoded.
- h = uheader(mlist, mlist.description, 'List-Id', maxlinelen=10000)
- desc = str(h)
+ i18ndesc = uheader(mlist, mlist.description, 'List-Id', maxlinelen=998)
+ listid_h = formataddr((str(i18ndesc), listid))
else:
- desc = ''
- listid_h = formataddr((desc, listid))
- # BAW: I think the message object should handle any necessary wrapping.
+ # without desc we still need to ensure the brackets, which are a MUST
+ listid_h = '<%s>' % listid
+ # We always add a List-ID: header.
del msg['list-id']
msg['List-Id'] = listid_h
# For internally crafted messages, we
@@ -218,7 +225,9 @@ def prefix_subject(mlist, msg, msgdata):
# Add the subject prefix unless the message is a digest or is being fast
# tracked (e.g. internally crafted, delivered to a single user such as the
# list admin).
- prefix = mlist.subject_prefix
+ prefix = mlist.subject_prefix.strip()
+ if not prefix:
+ return
subject = msg.get('subject', '')
# Try to figure out what the continuation_ws is for the header
if isinstance(subject, Header):
@@ -229,35 +238,91 @@ def prefix_subject(mlist, msg, msgdata):
if len(lines) > 1 and lines[1] and lines[1][0] in ' \t':
ws = lines[1][0]
msgdata['origsubj'] = subject
- if not subject:
+ # The subject may be multilingual but we take the first charset
+ # as the major one and try to decode. If it is decodable, the returned
+ # subject is in one line and cset is properly set. If decoding fails,
+ # the subject is left mime-encoded and cset is set to us-ascii. See
+ # ch_oneline() (CookHeaders one line function) for details.
+ subject, cset = ch_oneline(subject)
+ # Note: searching prefix in subject is REMOVED. (seq version)
+ # If the subject_prefix contains '%d', it is replaced with the
+ # mailing list sequential number. Also, if the prefix is enclosed in
+ # [], (), or {}, a prefix already in a reply's subject is stripped first.
+ # The sequential number format allows patterns like '%05d'.
+ p = re.compile('%\d*d')
+ if p.search(prefix,1):
+ # prefix has a number, so we should search for prefix w/number
+ # in subject.
+ prefix_pattern = p.sub(r'\s*\d+\s*', prefix)
+ else:
+ prefix_pattern = prefix
+ prefix_pattern = re.sub('([\[\(\{\)])', '\\\\\g<1>', prefix_pattern)
+ subject = re.sub(prefix_pattern, '', subject)
+ subject = re.compile('(RE:\s*)+', re.I).sub('Re: ', subject, 1)
+ # At this point, subject may be empty if someone posts mail with
+ # subject: [subject prefix]
+ if subject.strip() == '':
subject = _('(no subject)')
- # The header may be multilingual; decode it from base64/quopri and search
- # each chunk for the prefix. BAW: Note that if the prefix contains spaces
- # and each word of the prefix is encoded in a different chunk in the
- # header, we won't find it. I think in practice that's unlikely though.
- headerbits = decode_header(subject)
- if prefix and subject:
- pattern = re.escape(prefix.strip())
- for decodedsubj, charset in headerbits:
- if re.search(pattern, decodedsubj, re.IGNORECASE):
- # The subject's already got the prefix, so don't change it
- return
- del msg['subject']
+ cset = Utils.GetCharSet(mlist.preferred_language)
+ # and substitute %d in prefix with post_id
+ try:
+ prefix = prefix % mlist.post_id
+ except TypeError:
+ pass
+ # If charset is 'us-ascii', try to concatenate as string because there
+ # is some weirdness in the Header module (TK)
+ if cset == 'us-ascii':
+ try:
+ h = prefix + ' ' + subject
+ if type(h) == UnicodeType:
+ h = h.encode('us-ascii')
+ else:
+ h = unicode(h, 'us-ascii').encode('us-ascii')
+ del msg['subject']
+ msg['Subject'] = h
+ return
+ except UnicodeError:
+ pass
# Get the header as a Header instance, with proper unicode conversion
h = uheader(mlist, prefix, 'Subject', continuation_ws=ws)
- for s, c in headerbits:
- # Once again, convert the string to unicode.
- if c is None:
- c = Charset('iso-8859-1')
- if not isinstance(c, Charset):
- c = Charset(c)
- if not _isunicode(s):
- codec = c.input_codec or 'ascii'
- try:
- s = unicode(s, codec, 'replace')
- except LookupError:
- # Unknown codec, is this default reasonable?
- s = unicode(s, Utils.GetCharSet(mlist.preferred_language),
- 'replace')
- h.append(s, c)
+ # in seq version, subject header is already concatenated
+ if not _isunicode(subject):
+ try:
+ subject = unicode(subject, cset, 'replace')
+ except (LookupError, TypeError):
+ # unknown codec
+ cset = Utils.GetCharSet(mlist.preferred_language)
+ subject = unicode(subject, cset, 'replace')
+ subject = subject.encode(cset,'replace')
+ h.append(subject, cset)
+ del msg['subject']
msg['Subject'] = h
+
+
+def ch_oneline(s):
+    # Decode the header string `s` into one line in a single charset.
+    # Copied and modified from ToDigest.py and Utils.py.
+    #
+    # Returns a (string, cset) tuple; cset doubles as the failure check.
+    # On success the string is the decoded header with line breaks
+    # removed, encoded in cset (charset of the first chunk that has one,
+    # else 'us-ascii').  On failure it is the undecoded `s` with line
+    # breaks removed and cset is 'us-ascii'.
+    try:
+        # rstrip() every chunk because some MUAs deliberately add trailing
+        # spaces when composing a reply.  The chunk names must not rebind
+        # `s`: the except clause below needs the original header string.
+        d = [(chunk.rstrip(), c) for (chunk, c) in decode_header(s)]
+        cset = 'us-ascii'
+        for x in d:
+            # search for no-None charset
+            if x[1]:
+                cset = x[1]
+                break
+        h = make_header(d)
+        ustr = h.__unicode__()
+        oneline = u''.join(ustr.splitlines())
+        return oneline.encode(cset, 'replace'), cset
+    except (LookupError, UnicodeError, ValueError, HeaderParseError):
+        # possibly charset problem. return with undecoded string in one line.
+        return ''.join(s.splitlines()), 'us-ascii'