aboutsummaryrefslogtreecommitdiffstats
path: root/bin/cleanarch
blob: 85a8df6aaf4d01a332eb3f3bb405c25f7d075603 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#! @PYTHON@

# Copyright (C) 2001,2002 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software 
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

"""Clean up an .mbox archive file.

The archiver looks for Unix-From lines separating messages in an mbox archive
file.  For compatibility, it specifically looks for lines that start with
"From " -- i.e. the letters capital-F, lowercase-r, o, m, space, ignoring
everything else on the line.

Normally, any lines that start "From " in the body of a message should be
escaped such that a > character is actually the first on a line.  It is
possible though that body lines are not actually escaped.  This script
attempts to fix these by doing a stricter test of the Unix-From lines.  Any
lines that start "From " but do not pass this stricter test are escaped with a
> character.

Usage: cleanarch [options] < inputfile > outputfile
Options:
    -s n
    --status=n
        Print a # character every n lines processed

    -q / --quiet
        Don't print changed line information to standard error.

    -n / --dry-run
        Don't actually output anything.

    -h / --help
        Print this message and exit
"""

import sys
import re
import getopt
import mailbox

import paths
from Mailman.i18n import _

# Strict matcher for a genuine Unix-From separator line; the pattern itself is
# borrowed from the standard library's mailbox module rather than redefined.
# NOTE(review): _fromlinepattern is a private attribute of
# mailbox.UnixMailbox -- confirm it is available in the Python version this
# script is deployed on.
cre = re.compile(mailbox.UnixMailbox._fromlinepattern)

# From RFC 2822, a header field name must contain only characters from 33-126
# inclusive, excluding colon.  I.e. from oct 41 to oct 176 less oct 072.  Must
# use re.match() so that it's anchored at the beginning of the line.
#
# Octal escapes in a pattern are written with at most three digits.  A
# four-digit spelling of the upper bound (a leading zero before 176) is read
# as oct 017 followed by a literal "6", which makes the second range run
# backwards (oct 073 down to oct 017) and the pattern fail to compile.
fre = re.compile(r'[\041-\071\073-\176]+')



def usage(code, msg=''):
    """Print the module help text plus an optional message, then exit.

    code -- process exit status; a non-zero value routes all output to
            standard error, zero routes it to standard output.
    msg  -- extra text printed after the help text when non-empty.

    Never returns: always finishes with sys.exit(code).
    """
    # Default to the "help was asked for" stream, and switch to the error
    # stream only when reporting a failure.
    fd = sys.stdout
    if code:
        fd = sys.stderr
    print >> fd, _(__doc__)
    if msg:
        print >> fd, msg
    sys.exit(code)



def escape_line(line, lineno, quiet, output):
    """Emit a bogus "From " line in escaped form and report the change.

    line   -- the offending line exactly as read, normally newline-terminated.
    lineno -- line number shown in the diagnostic.
    quiet  -- when true, nothing is written to standard error.
    output -- when true, '>' + line is written to standard output.
    """
    if output:
        sys.stdout.write('>' + line)
    if not quiet:
        # NOTE(review): _() is presumably Mailman's translation helper, which
        # fills %(lineno)d from this function's local names -- keep the
        # parameter name `lineno` and call _() directly from here.
        print >> sys.stderr, _('Unix-From line changed: %(lineno)d')
        # print adds its own newline, so drop the line's terminator -- but
        # only if there is one.  The final line of an mbox may lack it, and
        # blindly slicing off the last character would then eat real text.
        shown = line
        if shown.endswith('\n'):
            shown = shown[:-1]
        print >> sys.stderr, shown



def main():
    """Copy an mbox from standard input to standard output, escaping every
    line that starts with "From " but is not a genuine message separator.

    Command line options (see the module docstring for the user-facing text):
        -h / --help      print the help text and exit with status 0
        -q / --quiet     do not report changed lines on standard error
        -n / --dry-run   do not write the cleaned mbox to standard output
        -s n / --status=n  write a '#' to standard error whenever the current
                           line number is a multiple of n

    Positional arguments and unparsable options end the program through
    usage(1, ...).  A line starting with "From " is accepted as a separator
    only when it matches `cre` AND the line after it looks like a header
    (contains a colon and starts with a character accepted by `fre`).  The
    number of accepted separators is printed to standard error at the end.
    """
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hqns:',
            ['help', 'quiet', 'dry-run', 'status='])
    except getopt.error, msg:
        usage(1, msg)

    quiet = 0
    output = 1
    status = -1

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-q', '--quiet'):
            quiet = 1
        elif opt in ('-n', '--dry-run'):
            output = 0
        elif opt in ('-s', '--status'):
            try:
                status = int(arg)
            except ValueError:
                # NOTE(review): _() presumably interpolates %(arg)s from this
                # function's local names, so the loop variable must stay
                # named `arg`.
                usage(1, _('Bad status number: %(arg)s'))

    if args:
        usage(1)

    lineno = 0
    statuscnt = 0
    messages = 0
    while 1:
        lineno += 1
        line = sys.stdin.readline()
        if not line:
            break
        if line.startswith('From '):
            if cre.match(line):
                # This is a real Unix-From line.  But it could be a message
                # /about/ Unix-From lines, so as a second order test, make
                # sure there's at least one RFC 2822 header following.
                #
                # Remember where the candidate line sits before reading
                # ahead: lineno moves on to the look-ahead line, and any
                # diagnostic must still name the "From " line itself.
                fromlineno = lineno
                nextline = sys.stdin.readline()
                lineno += 1
                if not nextline:
                    # It was the last line of the mbox, so it couldn't have
                    # been a Unix-From
                    escape_line(line, fromlineno, quiet, output)
                    break
                fieldname = nextline.split(':', 1)
                if len(fieldname) < 2 or not fre.match(nextline):
                    # The following line was not a header, so this wasn't a
                    # valid Unix-From
                    escape_line(line, fromlineno, quiet, output)
                    if output:
                        sys.stdout.write(nextline)
                else:
                    # It's a valid Unix-From line
                    messages += 1
                    if output:
                        sys.stdout.write(line)
                        sys.stdout.write(nextline)
            else:
                # This is a bogus Unix-From line
                escape_line(line, lineno, quiet, output)
        elif output:
            # Any old line
            sys.stdout.write(line)
        # Progress marker.  The check runs once per loop pass, so a multiple
        # of `status` that falls on a look-ahead line is tested with that
        # line's number rather than the separator's.
        if status > 0 and (lineno % status) == 0:
            sys.stderr.write('#')
            statuscnt += 1
            if statuscnt > 50:
                # Wrap the row of progress marks.
                print >> sys.stderr
                statuscnt = 0
    # NOTE(review): %(messages)d is presumably filled by _() from the local
    # variable `messages`; do not rename it.
    print >> sys.stderr, _('%(messages)d messages found')



if __name__ == '__main__':
    main()