Mailman/Handlers/Tagger.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170

# Copyright (C) 2001-2015 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software 
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

"""Extract topics from the original mail message.
"""

import re
import email
import email.Errors
import email.Iterators
import email.Parser

from email.Header import decode_header

from Mailman import Utils
from Mailman.Logging.Syslog import syslog
from Mailman.Handlers.CookHeaders import change_header

OR = '|'
CRNL = '\r\n'
EMPTYSTRING = ''
NLTAB = '\n\t'


def process(mlist, msg, msgdata):
    if not mlist.topics_enabled:
        return
    # Helper function.  Return RFC 2047 decoded header as a string in the
    # charset of the list's preferred language.
    def _decode(h):
        if not h:
            return h
        return Utils.oneline(h, Utils.GetCharSet(mlist.preferred_language))
    # Extract the Subject:, Keywords:, and possibly body text
    matchlines = []
    matchlines.append(_decode(msg.get('subject', None)))
    matchlines.append(_decode(msg.get('keywords', None)))
    if mlist.topics_bodylines_limit == 0:
        # Don't scan any body lines
        pass
    elif mlist.topics_bodylines_limit < 0:
        # Scan all body lines
        matchlines.extend(scanbody(msg))
    else:
        # Scan just some of the body lines
        matchlines.extend(scanbody(msg, mlist.topics_bodylines_limit))
    matchlines = filter(None, matchlines)
    # For each regular expression in the topics list, see if any of the lines
    # of interest from the message match the regexp.  If so, the message gets
    # added to the specific topics bucket.
    hits = {}
    for name, pattern, desc, emptyflag in mlist.topics:
        pattern = OR.join(pattern.splitlines())
        cre = re.compile(pattern, re.IGNORECASE)
        for line in matchlines:
            if cre.search(line):
                hits[name] = 1
                break
    if hits:
        msgdata['topichits'] = hits.keys()
        change_header('X-Topics', NLTAB.join(hits.keys()),
                      mlist, msg, msgdata, delete=False)


def scanbody(msg, numlines=None):
    # We only scan the body of the message if it is of MIME type text/plain,
    # or if the outer type is multipart/alternative and there is a text/plain
    # part.  Anything else, and the body is ignored for header-scan purposes.
    found = None
    if msg.get_content_type() == 'text/plain':
        found = msg
    elif (msg.is_multipart() and
          msg.get_content_type() == 'multipart/alternative'):
        for found in msg.get_payload():
            if found.get_content_type() == 'text/plain':
                break
        else:
            found = None
    if not found:
        return []
    # Now that we have a Message object that meets our criteria, let's extract
    # the first numlines of body text.
    lines = []
    lineno = 0
    reader = list(email.Iterators.body_line_iterator(msg, decode=True))
    while numlines is None or lineno < numlines:
        try:
            line = reader.pop(0)
        except IndexError:
            break
        # Blank lines don't count
        if not line.strip():
            continue
        lineno += 1
        lines.append(line)
    # Concatenate those body text lines with newlines, and then create a new
    # message object from those lines.
    p = _ForgivingParser()
    msg = p.parsestr(EMPTYSTRING.join(lines))
    return msg.get_all('subject', []) + msg.get_all('keywords', [])


class _ForgivingParser(email.Parser.HeaderParser):
    # Be a little more forgiving about non-header/continuation lines, since
    # we'll just read as much as we can from "header-like" lines in the body.
    #
    # BAW: WIBNI we didn't have to cut-n-paste this whole thing just to
    # specialize the way it returns?
    def _parseheaders(self, container, fp):
        # Parse the headers, returning a list of header/value pairs.  None as
        # the header means the Unix-From header.
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            line = line.splitlines()[0]
            if not line:
                break
            # Ignore the trailing newline
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                else:
                    break
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    break
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                break
            if lastheader:
                container[lastheader] = NLTAB.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NLTAB.join(lastvalue)