#! @PYTHON@ # Copyright (C) 2001-2003 by the Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. """Clean up an .mbox archive file. The archiver looks for Unix-From lines separating messages in an mbox archive file. For compatibility, it specifically looks for lines that start with "From " -- i.e. the letters capital-F, lowercase-r, o, m, space, ignoring everything else on the line. Normally, any lines that start "From " in the body of a message should be escaped such that a > character is actually the first on a line. It is possible though that body lines are not actually escaped. This script attempts to fix these by doing a stricter test of the Unix-From lines. Any lines that start "From " but do not pass this stricter test are escaped with a > character. Usage: cleanarch [options] < inputfile > outputfile Options: -s n --status=n Print a # character every n lines processed -q / --quiet Don't print changed line information to standard error. -n / --dry-run Don't actually output anything. -h / --help Print this message and exit """ import re import sys import getopt import mailbox import paths from Mailman.i18n import _ cre = re.compile(mailbox.UnixMailbox._fromlinepattern) # From RFC 2822, a header field name must contain only characters from 33-126 # inclusive, excluding colon. I.e. from oct 41 to oct 176 less oct 072. Must # use re.match() so that it's anchored at the beginning of the line. fre = re.compile(r'[\041-\071\073-\0176]+') def usage(code, msg=''): if code: fd = sys.stderr else: fd = sys.stdout print >> fd, _(__doc__) if msg: print >> fd, msg sys.exit(code) def escape_line(line, lineno, quiet, output): if output: sys.stdout.write('>' + line) if not quiet: print >> sys.stderr, _('Unix-From line changed: %(lineno)d') print >> sys.stderr, line[:-1] def main(): try: opts, args = getopt.getopt( sys.argv[1:], 'hqns:', ['help', 'quiet', 'dry-run', 'status=']) except getopt.error, msg: usage(1, msg) quiet = False output = True status = -1 for opt, arg in opts: if opt in ('-h', '--help'): usage(0) elif opt in ('-q', '--quiet'): quiet = True elif opt in ('-n', '--dry-run'): output = False elif opt in ('-s', '--status'): try: status = int(arg) except ValueError: usage(1, _('Bad status number: %(arg)s')) if args: usage(1) lineno = 0 statuscnt = 0 messages = 0 prevline = None while True: lineno += 1 line = sys.stdin.readline() if not line: break if line.startswith('From '): if cre.match(line): # This is a real Unix-From line. But it could be a message # /about/ Unix-From lines, so as a second order test, make # sure there's at least one RFC 2822 header following nextline = sys.stdin.readline() lineno += 1 if not nextline: # It was the last line of the mbox, so it couldn't have # been a Unix-From escape_line(line, lineno, quiet, output) break fieldname = nextline.split(':', 1) if len(fieldname) < 2 or not fre.match(nextline): # The following line was not a header, so this wasn't a # valid Unix-From escape_line(line, lineno, quiet, output) if output: sys.stdout.write(nextline) else: # It's a valid Unix-From line messages += 1 if output: # Before we spit out the From_ line, make sure the # previous line was blank. if prevline is not None and prevline <> '\n': sys.stdout.write('\n') sys.stdout.write(line) sys.stdout.write(nextline) else: # This is a bogus Unix-From line escape_line(line, lineno, quiet, output) elif output: # Any old line sys.stdout.write(line) if status > 0 and (lineno % status) == 0: sys.stderr.write('#') statuscnt += 1 if statuscnt > 50: print >> sys.stderr statuscnt = 0 prevline = line print >> sys.stderr, _('%(messages)d messages found') if __name__ == '__main__': main()