1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
|
# Copyright (C) 2001-2011 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""Extract topics from the original mail message.
"""
import re
import email
import email.Errors
import email.Iterators
import email.Parser
from email.Header import decode_header
from Mailman import Utils
from Mailman.Logging.Syslog import syslog
# Carriage return + line feed pair.  Not referenced by the code visible in
# this module.
CRNL = '\r\n'
# Joiner used to glue the selected body lines back together; the lines are
# joined as-is, with nothing inserted between them.
EMPTYSTRING = ''
# Newline + tab joiner used when several values are folded into a single
# header value (the X-Topics: header and multi-line parsed headers).
NLTAB = '\n\t'
def process(mlist, msg, msgdata):
    """Tag msg with the names of the list topics it matches.

    Nothing is done unless mlist.topics_enabled is true.  Otherwise the
    Subject: and Keywords: headers of msg, plus any Subject:/Keywords:
    pseudo-headers returned by scanbody(), are searched with each topic's
    regular expression.

    mlist.topics_bodylines_limit controls the body scan:
        0   -- the body is not scanned at all
        < 0 -- scanbody() is called without a line limit
        > 0 -- scanbody() is called with that value as the line limit

    mlist.topics is iterated as 4-tuples (name, pattern, desc, emptyflag);
    only the name and the pattern are used here.  Patterns are compiled
    with re.IGNORECASE | re.VERBOSE.

    When at least one topic matches, the matching names are stored in
    msgdata['topichits'] and an X-Topics: header is added to msg, with the
    names joined by NLTAB.  Always returns None.
    """
    if not mlist.topics_enabled:
        return

    def _to_list_charset(header):
        # Return the RFC 2047 decoded header as a one-line string in the
        # charset of the list's preferred language.  Missing or empty
        # values are handed back untouched.
        if header:
            return Utils.oneline(
                header, Utils.GetCharSet(mlist.preferred_language))
        return header

    # The real headers come first, followed by whatever the body scan
    # yields.
    candidates = [_to_list_charset(msg.get(field, None))
                  for field in ('subject', 'keywords')]
    limit = mlist.topics_bodylines_limit
    if limit != 0:
        if limit < 0:
            # No limit: scan all body lines
            candidates.extend(scanbody(msg))
        else:
            # Scan just the leading body lines
            candidates.extend(scanbody(msg, limit))
    # Drop missing headers and empty strings before matching.
    candidates = [text for text in candidates if text]

    # A topic is hit as soon as any one candidate line matches its regexp;
    # the dictionary is only used to collect the unique topic names.
    matched = {}
    for name, pattern, _desc, _emptyflag in mlist.topics:
        topic_re = re.compile(pattern, re.IGNORECASE | re.VERBOSE)
        for text in candidates:
            if topic_re.search(text):
                matched[name] = 1
                break
    if not matched:
        return
    topicnames = matched.keys()
    msgdata['topichits'] = topicnames
    msg['X-Topics'] = NLTAB.join(topicnames)
def scanbody(msg, numlines=None):
    """Return Subject: and Keywords: pseudo-header values found in the body.

    The body is only scanned when msg itself is text/plain, or when msg is
    a multipart/alternative message containing a text/plain part (the first
    such part is used).  For any other message an empty list is returned.

    msg      -- the message object to inspect.
    numlines -- maximum number of non-blank body lines to look at.  None
                (the default) means no limit; a value of zero or less
                selects no lines at all.

    The selected lines are parsed as if they were a header block and the
    values of all Subject: headers followed by all Keywords: headers are
    returned as a list (possibly empty).
    """
    ctype = msg.get_content_type()
    found = None
    if ctype == 'text/plain':
        found = msg
    elif ctype == 'multipart/alternative' and msg.is_multipart():
        for part in msg.get_payload():
            if part.get_content_type() == 'text/plain':
                found = part
                break
    # Compare against None explicitly: the length of a Message object is
    # its number of headers, so a text/plain part that carries no headers
    # of its own is falsy even though it is a perfectly good match.
    if found is None:
        return []
    # Extract the first numlines non-blank lines of the text/plain part
    # only.  Iterating over `found` rather than the outer message keeps
    # other alternatives (e.g. text/html) out of the scan, and consuming
    # the iterator lazily avoids decoding and copying the whole body when
    # only a few lines are wanted.
    lines = []
    if numlines is None or numlines > 0:
        for line in email.Iterators.body_line_iterator(found, decode=True):
            # Blank lines don't count
            if not line.strip():
                continue
            lines.append(line)
            if numlines is not None and len(lines) >= numlines:
                break
    # The lines still carry their own line endings, so join them as-is and
    # let the forgiving header parser pick out whatever looks like headers.
    p = _ForgivingParser()
    parsed = p.parsestr(EMPTYSTRING.join(lines))
    return parsed.get_all('subject', []) + parsed.get_all('keywords', [])
class _ForgivingParser(email.Parser.HeaderParser):
    """Header parser that quietly stops at the first non-header line.

    It is fed body text, so it simply reads as many "header-like" lines as
    it can and gives up without complaint at the first line that does not
    look like a header or a continuation line.

    NOTE(review): this relies on the base parser calling _parseheaders();
    confirm that against the version of the email package in use.
    """

    def _parseheaders(self, container, fp):
        """Read header lines from fp and store them on container.

        container -- object receiving the headers through item assignment,
                     and the Unix From_ line through set_unixfrom().
        fp        -- file-like object; only readline() is used.

        Reading stops at end of input, at an empty line, at a "From " line
        that is not the very first line, at a continuation line that has
        no header to continue, or at a line without a colon.  Continuation
        lines are joined to their header's value with NLTAB.
        """
        name = ''
        chunks = []
        seen = 0
        while True:
            raw = fp.readline()
            if not raw:
                # End of input
                break
            # Only the line terminator is removed here.  Other whitespace
            # must survive, because whitespace-only lines are RFC
            # compliant continuation lines and must not be mistaken for
            # the end of the headers.
            text = raw.splitlines()[0]
            if not text:
                break
            seen += 1
            # A Unix From_ line is only acceptable as the very first line
            if text.startswith('From '):
                if seen != 1:
                    break
                container.set_unixfrom(text)
                continue
            # Continuation of the header currently being collected
            if text[0] in ' \t':
                if not name:
                    break
                chunks.append(text)
                continue
            # Anything else has to look like "Name: value"
            if ':' not in text:
                break
            # A new header starts, so the one collected so far is complete
            if name:
                container[name] = NLTAB.join(chunks)
            name, value = text.split(':', 1)
            chunks = [value.lstrip()]
        # Make sure the header still being collected is not lost
        if name:
            container[name] = NLTAB.join(chunks)
|