Diffstat (limited to 'bin/mailmanctl')
-rw-r--r-- | bin/mailmanctl | 524 |
1 files changed, 524 insertions, 0 deletions
diff --git a/bin/mailmanctl b/bin/mailmanctl
new file mode 100644
index 00000000..0292e1f3
--- /dev/null
+++ b/bin/mailmanctl
@@ -0,0 +1,524 @@
+#! @PYTHON@
+
+# Copyright (C) 2001,2002 by the Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+"""Primary start-up and shutdown script for Mailman's qrunner daemon.
+
+This script starts, stops, and restarts the main Mailman queue runners, making
+sure that the various long-running qrunners are still alive and kicking. It
+does this by forking and exec'ing the qrunners and waiting on their pids.
+When it detects a subprocess has exited, it may restart it.
+
+The qrunners respond to SIGINT, SIGTERM, and SIGHUP. SIGINT and SIGTERM both
+cause the qrunners to exit cleanly, but the master will only restart qrunners
+that have exited due to a SIGINT. SIGHUP causes the master and the qrunners
+to close their log files, and reopen them upon the next printed message.
+
+The master also responds to SIGINT, SIGTERM, and SIGHUP, which it simply
+passes on to the qrunners (note that the master will close and reopen its own
+log files on receipt of a SIGHUP). The master also leaves its own process id
+in the file data/master-qrunner.pid but you normally don't need to use this
+pid directly. The `start', `stop', `restart', and `reopen' commands handle
+everything for you.
+
+Usage: %(PROGRAM)s [options] [ start | stop | restart | reopen ]
+
+Options:
+
+    -n/--no-restart
+        Don't restart the qrunners when they exit because of an error or a
+        SIGINT. They are never restarted if they exit in response to a
+        SIGTERM. Use this only for debugging. Only useful if the `start'
+        command is given.
+
+    -u/--run-as-user
+        Normally, this script will refuse to run if the user id and group id
+        are not set to the `mailman' user and group (as defined when you
+        configured Mailman). If run as root, this script will change to this
+        user and group before the check is made.
+
+        This can be inconvenient for testing and debugging purposes, so the -u
+        flag means that the step that sets and checks the uid/gid is skipped,
+        and the program is run as the current user and group. This flag is
+        not recommended for normal production environments.
+
+        Note though, that if you run with -u and are not in the mailman group,
+        you may have permission problems, such as being unable to delete a
+        list's archives through the web. Tough luck!
+
+    -s/--stale-lock-cleanup
+        If mailmanctl finds an existing master lock, it will normally exit
+        with an error message. With this option, mailmanctl will perform an
+        extra level of checking. If a process matching the host/pid described
+        in the lock file is running, mailmanctl will still exit, but if no
+        matching process is found, mailmanctl will remove the apparently stale
+        lock and make another attempt to claim the master lock.
+
+    -q/--quiet
+        Don't print status messages. Error messages are still printed to
+        standard error.
+
+    -h/--help
+        Print this message and exit.
+
+Commands:
+
+    start   - Start the master daemon and all qrunners. Prints a message and
+              exits if the master daemon is already running.
+
+    stop    - Stops the master daemon and all qrunners. After stopping, no
+              more messages will be processed.
+
+    restart - Restarts the qrunners, but not the master process. Use this
+              whenever you upgrade or update Mailman so that the qrunners will
+              use the newly installed code.
+
+    reopen  - This will close all log files, causing them to be re-opened the
+              next time a message is written to them.
+"""
+
+import sys
+import os
+import time
+import getopt
+import signal
+import errno
+import pwd
+import grp
+import socket
+
+import paths
+from Mailman import mm_cfg
+from Mailman import Utils
+from Mailman import LockFile
+from Mailman.i18n import _
+from Mailman.Logging.Syslog import syslog
+from Mailman.Logging.Utils import LogStdErr
+
+PROGRAM = sys.argv[0]
+COMMASPACE = ', '
+DOT = '.'
+
+# Locking constants
+LOCKFILE = os.path.join(mm_cfg.LOCK_DIR, 'master-qrunner')
+# Since we wake up once per day and refresh the lock, the LOCK_LIFETIME
+# needn't be (much) longer than SNOOZE. We pad it 6 hours just to be safe.
+LOCK_LIFETIME = mm_cfg.days(1) + mm_cfg.hours(6)
+SNOOZE = mm_cfg.days(1)
+MAX_RESTARTS = 10
+
+LogStdErr('error', 'mailmanctl', manual_reprime=0)
+
+
+
+def usage(code, msg=''):
+    if code:
+        fd = sys.stderr
+    else:
+        fd = sys.stdout
+    print >> fd, _(__doc__)
+    if msg:
+        print >> fd, msg
+    sys.exit(code)
+
+
+
+def kill_watcher(sig):
+    try:
+        fp = open(mm_cfg.PIDFILE)
+        pidstr = fp.read()
+        fp.close()
+        pid = int(pidstr.strip())
+    except (IOError, ValueError), e:
+        # For i18n convenience
+        pidfile = mm_cfg.PIDFILE
+        print >> sys.stderr, _('PID unreadable in: %(pidfile)s')
+        print >> sys.stderr, e
+        print >> sys.stderr, _('Is qrunner even running?')
+        return
+    try:
+        os.kill(pid, sig)
+    except OSError, e:
+        if e.errno <> errno.ESRCH: raise
+        print >> sys.stderr, _('No child with pid: %(pid)s')
+        print >> sys.stderr, e
+        print >> sys.stderr, _('Stale pid file removed.')
+        os.unlink(mm_cfg.PIDFILE)
+
+
+
+def get_lock_data():
+    # Return the hostname, pid, and tempfile
+    fp = open(LOCKFILE)
+    filename = os.path.split(fp.read().strip())[1]
+    fp.close()
+    parts = filename.split('.')
+    hostname = DOT.join(parts[1:-1])
+    pid = int(parts[-1])
+    return hostname, pid, filename
+
+
+def qrunner_state():
+    # 1 if proc exists on host (but is it qrunner? ;)
+    # 0 if host matches but no proc
+    # hostname if hostname doesn't match
+    hostname, pid, tempfile = get_lock_data()
+    if hostname <> socket.gethostname():
+        return hostname
+    # Find out if the process exists by calling kill with a signal 0.
+    try:
+        os.kill(pid, 0)
+    except OSError, e:
+        if e.errno <> errno.ESRCH: raise
+        return 0
+    return 1
+
+
+def acquire_lock_1(force):
+    # Be sure we can acquire the master qrunner lock. If not, it means some
+    # other master qrunner daemon is already going.
+    lock = LockFile.LockFile(LOCKFILE, LOCK_LIFETIME)
+    try:
+        lock.lock(0.1)
+        return lock
+    except LockFile.TimeOutError:
+        if not force:
+            raise
+        # Force removal of lock first
+        lock._disown()
+        hostname, pid, tempfile = get_lock_data()
+        os.unlink(LOCKFILE)
+        os.unlink(os.path.join(mm_cfg.LOCK_DIR, tempfile))
+        return acquire_lock_1(force=0)
+
+
+def acquire_lock(force):
+    try:
+        lock = acquire_lock_1(force)
+        return lock
+    except LockFile.TimeOutError:
+        status = qrunner_state()
+        if status == 1:
+            # host matches and proc exists
+            print >> sys.stderr, _("""\
+The master qrunner lock could not be acquired because it appears as if another
+master qrunner is already running.
+""")
+        elif status == 0:
+            # host matches but no proc
+            print >> sys.stderr, _("""\
+The master qrunner lock could not be acquired. It appears as though there is
+a stale master qrunner lock. Try re-running mailmanctl with the -s flag.
+""")
+        else:
+            # host doesn't even match
+            print >> sys.stderr, _("""\
+The master qrunner lock could not be acquired, because it appears as if some
+process on some other host may have acquired it. We can't test for stale
+locks across host boundaries, so you'll have to do this manually. Or, if you
+know the lock is stale, re-run mailmanctl with the -s flag.
+
+Lock file: %(LOCKFILE)s
+Lock host: %(status)s
+
+Exiting.""")
+
+
+
+def start_runner(qrname, slice, count):
+    pid = os.fork()
+    if pid:
+        # parent
+        return pid
+    # child
+    #
+    # Craft the command line arguments for the exec() call.
+    rswitch = '--runner=%s:%d:%d' % (qrname, slice, count)
+    # BAW: should argv[0] be `python'?
+    exe = os.path.join(mm_cfg.BIN_DIR, 'qrunner')
+    os.execl(mm_cfg.PYTHON, 'qrunner', exe, rswitch, '-s')
+    # Should never get here
+    raise RuntimeError, 'os.execl() failed'
+
+
+def start_all_runners():
+    kids = {}
+    for qrname, count in mm_cfg.QRUNNERS:
+        for slice in range(count):
+            # queue runner name, slice, numslices, restart count
+            info = (qrname, slice, count, 0)
+            pid = start_runner(qrname, slice, count)
+            kids[pid] = info
+    return kids
+
+
+
+def check_privs():
+    # If we're running as root (uid == 0), coerce the uid and gid to that
+    # which Mailman was configured for, and refuse to run if we didn't coerce
+    # the uid/gid.
+    gid = grp.getgrnam(mm_cfg.MAILMAN_GROUP)[2]
+    uid = pwd.getpwnam(mm_cfg.MAILMAN_USER)[2]
+    myuid = os.getuid()
+    if myuid == 0:
+        os.setgid(gid)
+        os.setuid(uid)
+    elif myuid <> uid:
+        name = mm_cfg.MAILMAN_USER
+        usage(1, _(
+            'Run this program as root or as the %(name)s user, or use -u.'))
+
+
+
+def main():
+    global quiet
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], 'hnusq',
+                                   ['help', 'no-restart', 'run-as-user',
+                                    'stale-lock-cleanup', 'quiet'])
+    except getopt.error, msg:
+        usage(1, msg)
+
+    restart = 1
+    checkprivs = 1
+    force = 0
+    quiet = 0
+    for opt, arg in opts:
+        if opt in ('-h', '--help'):
+            usage(0)
+        elif opt in ('-n', '--no-restart'):
+            restart = 0
+        elif opt in ('-u', '--run-as-user'):
+            checkprivs = 0
+        elif opt in ('-s', '--stale-lock-cleanup'):
+            force = 1
+        elif opt in ('-q', '--quiet'):
+            quiet = 1
+
+    if len(args) < 1:
+        usage(1, _('No command given.'))
+    elif len(args) > 1:
+        command = COMMASPACE.join(args)
+        usage(1, _('Bad command: %(command)s'))
+
+    if checkprivs:
+        check_privs()
+    else:
+        print _('Warning! You may encounter permission problems.')
+
+    # Handle the commands
+    command = args[0].lower()
+    if command == 'stop':
+        # Send the master qrunner process a SIGTERM, which is equivalent to
+        # giving cron/qrunner a ctrl-c or KeyboardInterrupt. This will
+        # effectively shut everything down.
+        if not quiet:
+            print _("Shutting down Mailman's master qrunner")
+        kill_watcher(signal.SIGTERM)
+    elif command == 'restart':
+        # Send the master qrunner process a SIGINT. This will cause the
+        # master qrunner to kill and restart all the worker qrunners, so
+        # that they pick up the newly installed code.
+        if not quiet:
+            print _("Restarting Mailman's master qrunner")
+        kill_watcher(signal.SIGINT)
+    elif command == 'reopen':
+        if not quiet:
+            print _('Re-opening all log files')
+        kill_watcher(signal.SIGHUP)
+    elif command == 'start':
+        # Here's the scoop on the processes we're about to create. We'll need
+        # one for each qrunner, and one for a master child process watcher /
+        # lock refresher process.
+        #
+        # The child watcher process simply waits on the pids of the child
+        # qrunners. Unless explicitly disabled by a mailmanctl switch (or the
+        # children are killed with SIGTERM instead of SIGINT), the watcher
+        # will automatically restart any child process that exits. This
+        # allows us to be more robust, and also to implement restart by simply
+        # SIGINT'ing the qrunner children, and letting the watcher restart
+        # them.
+        #
+        # Under normal operation, we have a child per queue. This lets us get
+        # the most out of the available resources, since a qrunner with no
+        # files in its queue directory is pretty cheap, but having a separate
+        # runner process per queue allows for a very responsive system. Some
+        # people want a more traditional (i.e. MM2.0.x) cron-invoked qrunner.
+        # No problem, but using mailmanctl isn't the answer. So while
+        # mailmanctl hard codes some things, others, such as the number of
+        # qrunners per queue, are configurable in mm_cfg.py.
+        #
+        # First, acquire the master mailmanctl lock
+        lock = acquire_lock(force)
+        if not lock:
+            return
+        # Daemon process startup according to Stevens, Advanced Programming in
+        # the UNIX Environment, Chapter 13.
+        pid = os.fork()
+        if pid:
+            # parent
+            if not quiet:
+                print _("Starting Mailman's master qrunner.")
+            # Give up the lock "ownership". This just means the foreground
+            # process won't close/unlock the lock when it finalizes this lock
+            # instance. We'll let the master watcher subproc own the lock.
+            lock._transfer_to(pid)
+            return
+        # child
+        lock._take_possession()
+        # First, save our pid in a file for "mailmanctl stop" rendezvous. We
+        # want the perms on the .pid file to be rw-rw----
+        omask = os.umask(6)
+        try:
+            fp = open(mm_cfg.PIDFILE, 'w')
+            print >> fp, os.getpid()
+            fp.close()
+        finally:
+            os.umask(omask)
+        # Create a new session and become the session leader, but since we
+        # won't be opening any terminal devices, don't do the ultra-paranoid
+        # suggestion of doing a second fork after the setsid() call.
+        os.setsid()
+        # Instead of cd'ing to root, cd to the Mailman installation home
+        os.chdir(mm_cfg.PREFIX)
+        # Clear our file mode creation umask
+        os.umask(0)
+        # I don't think we have any unneeded file descriptors.
+        #
+        # Now start all the qrunners. This returns a dictionary where the
+        # keys are qrunner pids and the values are tuples of the form
+        # (qrname, slice, count, restarts). This does its own fork and exec,
+        # and sets up its own signal handlers.
+        kids = start_all_runners()
+        # Set up a SIGALRM handler to refresh the lock once per day. The lock
+        # lifetime is 1day+6hours so this should be plenty.
+        def sigalrm_handler(signum, frame, lock=lock):
+            lock.refresh()
+            signal.alarm(mm_cfg.days(1))
+        signal.signal(signal.SIGALRM, sigalrm_handler)
+        signal.alarm(mm_cfg.days(1))
+        # Set up a SIGHUP handler so that if we get one, we'll pass it along
+        # to all the qrunner children. This will tell them to close and
+        # reopen their log files.
+        def sighup_handler(signum, frame, kids=kids):
+            # Closing our syslog will cause it to be re-opened at the next
+            # logged message.
+            syslog.close()
+            for pid in kids.keys():
+                os.kill(pid, signal.SIGHUP)
+            # And just to tweak things...
+            syslog('qrunner',
+                   'Master watcher caught SIGHUP. Re-opening log files.')
+        signal.signal(signal.SIGHUP, sighup_handler)
+        # We also need to install a SIGTERM handler because that's what init
+        # will kill this process with when changing run levels.
+        def sigterm_handler(signum, frame, kids=kids):
+            for pid in kids.keys():
+                try:
+                    os.kill(pid, signal.SIGTERM)
+                except OSError, e:
+                    if e.errno <> errno.ESRCH: raise
+            syslog('qrunner', 'Master watcher caught SIGTERM. Exiting.')
+        signal.signal(signal.SIGTERM, sigterm_handler)
+        # Finally, we need a SIGINT handler which will cause the sub-qrunners
+        # to exit, but the master will restart SIGINT'd sub-processes unless
+        # the -n flag was given.
+        def sigint_handler(signum, frame, kids=kids):
+            for pid in kids.keys():
+                os.kill(pid, signal.SIGINT)
+            syslog('qrunner', 'Master watcher caught SIGINT. Restarting.')
+        signal.signal(signal.SIGINT, sigint_handler)
+        # Now we're ready to simply do our wait/restart loop. This is the
+        # master qrunner watcher.
+        try:
+            while 1:
+                try:
+                    pid, status = os.wait()
+                except OSError, e:
+                    # No children? We're done
+                    if e.errno == errno.ECHILD:
+                        break
+                    # If the system call got interrupted, just restart it.
+                    elif e.errno <> errno.EINTR:
+                        raise
+                    continue
+                killsig = exitstatus = None
+                if os.WIFSIGNALED(status):
+                    killsig = os.WTERMSIG(status)
+                if os.WIFEXITED(status):
+                    exitstatus = os.WEXITSTATUS(status)
+                # We'll restart the process unless we were given the
+                # "no-restart" switch, or if the process was SIGTERM'd or
+                # exited with a SIGTERM exit status. This lets us better
+                # handle runaway restarts (say, if the subproc had a syntax
+                # error!)
+                restarting = ''
+                if restart:
+                    if (exitstatus == None and killsig <> signal.SIGTERM) or \
+                       (killsig == None and exitstatus <> signal.SIGTERM):
+                        # Then
+                        restarting = '[restarting]'
+                qrname, slice, count, restarts = kids[pid]
+                del kids[pid]
+                syslog('qrunner', """\
+Master qrunner detected subprocess exit
+(pid: %d, sig: %s, sts: %s, class: %s, slice: %d/%d) %s""",
+                       pid, killsig, exitstatus, qrname,
+                       slice+1, count, restarting)
+                # See if we've reached the maximum number of allowable restarts
+                if exitstatus <> signal.SIGINT:
+                    restarts += 1
+                if restarts > MAX_RESTARTS:
+                    syslog('qrunner', """\
+Qrunner %s reached maximum restart limit of %d, not restarting.""",
+                           qrname, MAX_RESTARTS)
+                    restarting = ''
+                # Now perhaps restart the process unless it exited with a
+                # SIGTERM or we aren't restarting.
+                if restarting:
+                    newpid = start_runner(qrname, slice, count)
+                    kids[newpid] = (qrname, slice, count, restarts)
+        finally:
+            # Should we leave the main loop for any reason, we want to be sure
+            # all of our children exit cleanly. Send SIGTERMs to all
+            # the child processes and wait for them all to exit.
+            for pid in kids.keys():
+                try:
+                    os.kill(pid, signal.SIGTERM)
+                except OSError, e:
+                    if e.errno == errno.ESRCH:
+                        # The child has already exited
+                        syslog('qrunner', 'ESRCH on pid: %d', pid)
+                        del kids[pid]
+            # Wait for all the children to go away
+            while 1:
+                try:
+                    pid, status = os.wait()
+                except OSError, e:
+                    if e.errno == errno.ECHILD:
+                        break
+                    elif e.errno <> errno.EINTR:
+                        raise
+                    continue
+            # Finally, give up the lock
+            lock.unlock(unconditionally=1)
+            os._exit(0)
+
+
+
+if __name__ == '__main__':
+    main()