Diffstat (limited to 'Mailman/Archiver')
 -rw-r--r--  Mailman/Archiver/.cvsignore       |    1
 -rw-r--r--  Mailman/Archiver/Archiver.py      |  232
 -rw-r--r--  Mailman/Archiver/HyperArch.py     | 1224
 -rw-r--r--  Mailman/Archiver/HyperDatabase.py |  338
 -rw-r--r--  Mailman/Archiver/Makefile.in      |   72
 -rw-r--r--  Mailman/Archiver/__init__.py      |   17
 -rw-r--r--  Mailman/Archiver/pipermail.py     |  854
7 files changed, 2738 insertions, 0 deletions
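
The heart of this commit is the Archiver mixin (Archiver.py, first file below), which decides for each posting whether to append it to a raw mbox, pipe it to a site-configured external archiver command, or feed it to the bundled pipermail/HyperArch HTML archiver. As an orientation aid before the diff body, here is a minimal, self-contained sketch of that dispatch; it is not Mailman code, and the names archive_mail, append_to_mbox, pipe_to_external, and run_pipermail are hypothetical stand-ins for the real methods defined in the diff:

    # Mirrors mm_cfg.ARCHIVE_TO_MBOX in the code below:
    # -1: no archiving, 0: pipermail only, 1: mbox only, 2: both
    ARCHIVE_TO_MBOX = 2

    def archive_mail(mlist, msg):
        """Sketch of the dispatch done by Archiver.ArchiveMail()."""
        if ARCHIVE_TO_MBOX == -1:
            return                       # archiving disabled entirely
        if ARCHIVE_TO_MBOX in (1, 2):
            mlist.append_to_mbox(msg)    # keep a raw mbox copy
        if ARCHIVE_TO_MBOX == 1:
            return                       # mbox only; skip HTML archiving
        if mlist.external_archiver:      # site-configured command, if any
            mlist.pipe_to_external(str(msg))
        else:
            mlist.run_pipermail(str(msg))  # bundled HyperArch archiver

The real ArchiveMail() additionally distinguishes public from private external archivers and relies on the caller already holding the list lock; those details appear in context in Archiver.py below.
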
diff --git a/Mailman/Archiver/.cvsignore b/Mailman/Archiver/.cvsignore new file mode 100644 index 00000000..f3c7a7c5 --- /dev/null +++ b/Mailman/Archiver/.cvsignore @@ -0,0 +1 @@ +Makefile diff --git a/Mailman/Archiver/Archiver.py b/Mailman/Archiver/Archiver.py new file mode 100644 index 00000000..903031cd --- /dev/null +++ b/Mailman/Archiver/Archiver.py @@ -0,0 +1,232 @@ +# Copyright (C) 1998,1999,2000,2001,2002 by the Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + +"""Mixin class for putting new messages in the right place for archival. + +Public archives are separated from private ones. An external archival +mechanism (eg, pipermail) should be pointed to the right places, to do the +archival. +""" + +import os +import errno +import traceback +from cStringIO import StringIO + +from Mailman import mm_cfg +from Mailman import Mailbox +from Mailman import Utils +from Mailman import Site +from Mailman.SafeDict import SafeDict +from Mailman.Logging.Syslog import syslog +from Mailman.i18n import _ + + + +def makelink(old, new): + try: + os.symlink(old, new) + except os.error, e: + code, msg = e + if code <> errno.EEXIST: + raise + +def breaklink(link): + try: + os.unlink(link) + except os.error, e: + code, msg = e + if code <> errno.ENOENT: + raise + + + +class Archiver: + # + # Interface to Pipermail. HyperArch.py uses this method to get the + # archive directory for the mailing list + # + def InitVars(self): + # Configurable + self.archive = mm_cfg.DEFAULT_ARCHIVE + # 0=public, 1=private: + self.archive_private = mm_cfg.DEFAULT_ARCHIVE_PRIVATE + self.archive_volume_frequency = \ + mm_cfg.DEFAULT_ARCHIVE_VOLUME_FREQUENCY + # The archive file structure by default is: + # + # archives/ + # private/ + # listname.mbox/ + # listname.mbox + # listname/ + # lots-of-pipermail-stuff + # public/ + # listname.mbox@ -> ../private/listname.mbox + # listname@ -> ../private/listname + # + # IOW, the mbox and pipermail archives are always stored in the + # private archive for the list. This is safe because archives/private + # is always set to o-rx. Public archives have a symlink to get around + # the private directory, pointing directly to the private/listname + # which has o+rx permissions. Private archives do not have the + # symbolic links. + omask = os.umask(0) + try: + try: + os.mkdir(self.archive_dir()+'.mbox', 02775) + except OSError, e: + if e.errno <> errno.EEXIST: raise + # We also create an empty pipermail archive directory into + # which we'll drop an empty index.html file into. This is so + # that lists that have not yet received a posting have + # /something/ as their index.html, and don't just get a 404. 
+ try: + os.mkdir(self.archive_dir(), 02775) + except OSError, e: + if e.errno <> errno.EEXIST: raise + # See if there's an index.html file there already and if not, + # write in the empty archive notice. + indexfile = os.path.join(self.archive_dir(), 'index.html') + fp = None + try: + fp = open(indexfile) + except IOError, e: + if e.errno <> errno.ENOENT: raise + else: + fp = open(indexfile, 'w') + fp.write(Utils.maketext( + 'emptyarchive.html', + {'listname': self.real_name, + 'listinfo': self.GetScriptURL('listinfo', absolute=1), + }, mlist=self)) + if fp: + fp.close() + finally: + os.umask(omask) + + def archive_dir(self): + return Site.get_archpath(self.internal_name()) + + def ArchiveFileName(self): + """The mbox name where messages are left for archive construction.""" + return os.path.join(self.archive_dir() + '.mbox', + self.internal_name() + '.mbox') + + def GetBaseArchiveURL(self): + if self.archive_private: + return self.GetScriptURL('private', absolute=1) + '/' + else: + inv = {} + for k, v in mm_cfg.VIRTUAL_HOSTS.items(): + inv[v] = k + url = mm_cfg.PUBLIC_ARCHIVE_URL % { + 'listname': self.internal_name(), + 'hostname': inv.get(self.host_name, mm_cfg.DEFAULT_URL_HOST), + } + if not url.endswith('/'): + url += '/' + return url + + def __archive_file(self, afn): + """Open (creating, if necessary) the named archive file.""" + omask = os.umask(002) + try: + return Mailbox.Mailbox(open(afn, 'a+')) + finally: + os.umask(omask) + + # + # old ArchiveMail function, retained under a new name + # for optional archiving to an mbox + # + def __archive_to_mbox(self, post): + """Retain a text copy of the message in an mbox file.""" + try: + afn = self.ArchiveFileName() + mbox = self.__archive_file(afn) + mbox.AppendMessage(post) + mbox.fp.close() + except IOError, msg: + syslog('error', 'Archive file access failure:\n\t%s %s', afn, msg) + raise + + def ExternalArchive(self, ar, txt): + d = SafeDict({'listname': self.internal_name()}) + cmd = ar % d + extarch = os.popen(cmd, 'w') + extarch.write(txt) + status = extarch.close() + if status: + syslog('error', 'external archiver non-zero exit status: %d\n', + (status & 0xff00) >> 8) + + # + # archiving in real time this is called from list.post(msg) + # + def ArchiveMail(self, msg): + """Store postings in mbox and/or pipermail archive, depending.""" + # Fork so archival errors won't disrupt normal list delivery + if mm_cfg.ARCHIVE_TO_MBOX == -1: + return + # + # We don't need an extra archiver lock here because we know the list + # itself must be locked. + if mm_cfg.ARCHIVE_TO_MBOX in (1, 2): + self.__archive_to_mbox(msg) + if mm_cfg.ARCHIVE_TO_MBOX == 1: + # Archive to mbox only. + return + txt = str(msg) + # should we use the internal or external archiver? + private_p = self.archive_private + if mm_cfg.PUBLIC_EXTERNAL_ARCHIVER and not private_p: + self.ExternalArchive(mm_cfg.PUBLIC_EXTERNAL_ARCHIVER, txt) + elif mm_cfg.PRIVATE_EXTERNAL_ARCHIVER and private_p: + self.ExternalArchive(mm_cfg.PRIVATE_EXTERNAL_ARCHIVER, txt) + else: + # use the internal archiver + f = StringIO(txt) + import HyperArch + h = HyperArch.HyperArchive(self) + h.processUnixMailbox(f) + h.close() + f.close() + + # + # called from MailList.MailList.Save() + # + def CheckHTMLArchiveDir(self): + # We need to make sure that the archive directory has the right perms + # for public vs private. 
If it doesn't exist, or some weird + # permissions errors prevent us from stating the directory, it's + # pointless to try to fix the perms, so we just return -scott + if mm_cfg.ARCHIVE_TO_MBOX == -1: + # Archiving is completely disabled, don't require the skeleton. + return + pubdir = Site.get_archpath(self.internal_name(), public=1) + privdir = self.archive_dir() + pubmbox = pubdir + '.mbox' + privmbox = privdir + '.mbox' + if self.archive_private: + breaklink(pubdir) + breaklink(pubmbox) + else: + # BAW: privdir or privmbox could be nonexistant. We'd get an + # OSError, ENOENT which should be caught and reported properly. + makelink(privdir, pubdir) + makelink(privmbox, pubmbox) diff --git a/Mailman/Archiver/HyperArch.py b/Mailman/Archiver/HyperArch.py new file mode 100644 index 00000000..98fb5738 --- /dev/null +++ b/Mailman/Archiver/HyperArch.py @@ -0,0 +1,1224 @@ +# Copyright (C) 1998,1999,2000,2001,2002 by the Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +"""HyperArch: Pipermail archiving for Mailman + + - The Dragon De Monsyne <dragondm@integral.org> + + TODO: + - Should be able to force all HTML to be regenerated next time the + archive is run, in case a template is changed. + - Run a command to generate tarball of html archives for downloading + (probably in the 'update_dirty_archives' method). +""" + +from __future__ import nested_scopes + +import sys +import re +import errno +import urllib +import time +import os +import types +import HyperDatabase +import pipermail +import weakref +import binascii + +from email.Header import decode_header, make_header + +from Mailman import mm_cfg +from Mailman import Utils +from Mailman import LockFile +from Mailman import MailList +from Mailman import i18n +from Mailman.SafeDict import SafeDict +from Mailman.Logging.Syslog import syslog +from Mailman.Mailbox import ArchiverMailbox + +# Set up i18n. Assume the current language has already been set in the caller. +_ = i18n._ + +gzip = None +if mm_cfg.GZIP_ARCHIVE_TXT_FILES: + try: + import gzip + except ImportError: + pass + +EMPTYSTRING = '' +NL = '\n' + +# MacOSX has a default stack size that is too small for deeply recursive +# regular expressions. We see this as crashes in the Python test suite when +# running test_re.py and test_sre.py. The fix is to set the stack limit to +# 2048; the general recommendation is to do in the shell before running the +# test suite. But that's inconvenient for a daemon like the qrunner. +# +# AFAIK, this problem only affects the archiver, so we're adding this work +# around to this file (it'll get imported by the bundled pipermail or by the +# bin/arch script. We also only do this on darwin, a.k.a. MacOSX. 
+if sys.platform == 'darwin': + try: + import resource + except ImportError: + pass + else: + soft, hard = resource.getrlimit(resource.RLIMIT_STACK) + newsoft = min(hard, max(soft, 1024*2048)) + resource.setrlimit(resource.RLIMIT_STACK, (newsoft, hard)) + + + +def html_quote(s, lang=None): + repls = ( ('&', '&'), + ("<", '<'), + (">", '>'), + ('"', '"')) + for thing, repl in repls: + s = s.replace(thing, repl) + return Utils.uncanonstr(s, lang) + + +def url_quote(s): + return urllib.quote(s) + + +def null_to_space(s): + return s.replace('\000', ' ') + + +def sizeof(filename, lang): + try: + size = os.path.getsize(filename) + except OSError, e: + # ENOENT can happen if the .mbox file was moved away or deleted, and + # an explicit mbox file name was given to bin/arch. + if e.errno <> errno.ENOENT: raise + return _('size not available') + if size < 1000: + # Avoid i18n side-effects + otrans = i18n.get_translation() + try: + i18n.set_language(lang) + out = _(' %(size)i bytes ') + finally: + i18n.set_translation(otrans) + return out + elif size < 1000000: + return ' %d KB ' % (size / 1000) + # GB?? :-) + return ' %d MB ' % (size / 1000000) + + +html_charset = '<META http-equiv="Content-Type" ' \ + 'content="text/html; charset=%s">' + +def CGIescape(arg, lang=None): + if isinstance(arg, types.UnicodeType): + s = Utils.websafe(arg) + else: + s = Utils.websafe(str(arg)) + return Utils.uncanonstr(s.replace('"', '"'), lang) + +# Parenthesized human name +paren_name_pat = re.compile(r'([(].*[)])') + +# Subject lines preceded with 'Re:' +REpat = re.compile( r"\s*RE\s*(\[\d+\]\s*)?:\s*", re.IGNORECASE) + +# E-mail addresses and URLs in text +emailpat = re.compile(r'([-+,.\w]+@[-+.\w]+)') + +# Argh! This pattern is buggy, and will choke on URLs with GET parameters. +urlpat = re.compile(r'(\w+://[^>)\s]+)') # URLs in text + +# Blank lines +blankpat = re.compile(r'^\s*$') + +# Starting <html> directive +htmlpat = re.compile(r'^\s*<HTML>\s*$', re.IGNORECASE) +# Ending </html> directive +nohtmlpat = re.compile(r'^\s*</HTML>\s*$', re.IGNORECASE) +# Match quoted text +quotedpat = re.compile(r'^([>|:]|>)+') + + + +# This doesn't need to be a weakref instance because it's just storing +# strings. Keys are (templatefile, lang) tuples. +_templatecache = {} + +def quick_maketext(templatefile, dict=None, lang=None, mlist=None): + if lang is None: + if mlist is None: + lang = mm_cfg.DEFAULT_SERVER_LANGUAGE + else: + lang = mlist.preferred_language + template = _templatecache.get((templatefile, lang)) + if template is None: + # Use the basic maketext, with defaults to get the raw template + template = Utils.maketext(templatefile, lang=lang, raw=1) + _templatecache[(templatefile, lang)] = template + # Copied from Utils.maketext() + text = template + if dict is not None: + try: + sdict = SafeDict(dict) + try: + text = sdict.interpolate(template) + except UnicodeError: + # Try again after coercing the template to unicode + utemplate = unicode(template, + Utils.GetCharSet(lang), + 'replace') + text = sdict.interpolate(utemplate) + except (TypeError, ValueError): + # The template is really screwed up + pass + # Make sure the text is in the given character set, or html-ify any bogus + # characters. + return Utils.uncanonstr(text, lang) + + + +# Note: I'm overriding most, if not all of the pipermail Article class +# here -ddm +# The Article class encapsulates a single posting. 
The attributes are: +# +# sequence : Sequence number, unique for each article in a set of archives +# subject : Subject +# datestr : The posting date, in human-readable format +# date : The posting date, in purely numeric format +# fromdate : The posting date, in `unixfrom' format +# headers : Any other headers of interest +# author : The author's name (and possibly organization) +# email : The author's e-mail address +# msgid : A unique message ID +# in_reply_to : If !="", this is the msgid of the article being replied to +# references: A (possibly empty) list of msgid's of earlier articles in +# the thread +# body : A list of strings making up the message body + +class Article(pipermail.Article): + __super_init = pipermail.Article.__init__ + __super_set_date = pipermail.Article._set_date + + _last_article_time = time.time() + + def __init__(self, message=None, sequence=0, keepHeaders=[], + lang=mm_cfg.DEFAULT_SERVER_LANGUAGE, mlist=None): + self.__super_init(message, sequence, keepHeaders) + self.prev = None + self.next = None + # Trim Re: from the subject line + i = 0 + while i != -1: + result = REpat.match(self.subject) + if result: + i = result.end(0) + self.subject = self.subject[i:] + else: + i = -1 + # Useful to keep around + self._lang = lang + self._mlist = mlist + + if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: + # Avoid i18n side-effects. Note that the language for this + # article (for this list) could be different from the site-wide + # preferred language, so we need to ensure no side-effects will + # occur. Think what happens when executing bin/arch. + otrans = i18n.get_translation() + try: + i18n.set_language(lang) + self.email = re.sub('@', _(' at '), self.email) + finally: + i18n.set_translation(otrans) + + # Snag the content-* headers. RFC 1521 states that their values are + # case insensitive. + ctype = message.get('Content-Type', 'text/plain') + cenc = message.get('Content-Transfer-Encoding', '') + self.ctype = ctype.lower() + self.cenc = cenc.lower() + self.decoded = {} + charset = message.get_param('charset') + if charset: + charset = charset.lower().strip() + if charset[0]=='"' and charset[-1]=='"': + charset = charset[1:-1] + if charset[0]=="'" and charset[-1]=="'": + charset = charset[1:-1] + try: + body = message.get_payload(decode=1) + except binascii.Error: + body = None + if body and charset != Utils.GetCharSet(self._lang): + # decode body + try: + body = unicode(body, charset) + except (UnicodeError, LookupError): + body = None + if body: + self.body = [l + "\n" for l in body.splitlines()] + + self.decode_headers() + + # Mapping of listnames to MailList instances as a weak value dictionary. + # This code is copied from Runner.py but there's one important operational + # difference. In Runner.py, we always .Load() the MailList object for + # each _dispose() run, otherwise the object retrieved from the cache won't + # be up-to-date. Since we're creating a new HyperArchive instance for + # each message being archived, we don't need to worry about that -- but it + # does mean there are additional opportunities for optimization. + _listcache = weakref.WeakValueDictionary() + + def _open_list(self, listname): + # Cache the open list so that any use of the list within this process + # uses the same object. We use a WeakValueDictionary so that when the + # list is no longer necessary, its memory is freed. 
+ mlist = self._listcache.get(listname) + if not mlist: + try: + mlist = MailList.MailList(listname, lock=0) + except Errors.MMListError, e: + syslog('error', 'error opening list: %s\n%s', listname, e) + return None + else: + self._listcache[listname] = mlist + return mlist + + def __getstate__(self): + d = self.__dict__.copy() + # We definitely don't want to pickle the MailList instance, so just + # pickle a reference to it. + if d.has_key('_mlist'): + mlist = d['_mlist'] + del d['_mlist'] + else: + mlist = None + if mlist: + d['__listname'] = self._mlist.internal_name() + else: + d['__listname'] = None + # Delete a few other things we don't want in the pickle + for attr in ('prev', 'next', 'body'): + if d.has_key(attr): + del d[attr] + d['body'] = [] + return d + + def __setstate__(self, d): + # For loading older Articles via pickle. All this stuff was added + # when Simone Piunni and Tokio Kikuchi i18n'ified Pipermail. See SF + # patch #594771. + self.__dict__ = d + listname = d.get('__listname') + if listname: + del d['__listname'] + d['_mlist'] = self._open_list(listname) + if not d.has_key('_lang'): + if hasattr(self, '_mlist'): + self._lang = self._mlist.preferred_language + else: + self._lang = mm_cfg.DEFAULT_SERVER_LANGUAGE + if not d.has_key('cenc'): + self.cenc = None + if not d.has_key('decoded'): + self.decoded = {} + + def setListIfUnset(self, mlist): + if getattr(self, '_mlist', None) is None: + self._mlist = mlist + + def quote(self, buf): + return html_quote(buf, self._lang) + + def decode_headers(self): + """MIME-decode headers. + + If the email, subject, or author attributes contain non-ASCII + characters using the encoded-word syntax of RFC 2047, decoded versions + of those attributes are placed in the self.decoded (a dictionary). + + If the list's charset differs from the header charset, an attempt is + made to decode the headers as Unicode. If that fails, they are left + undecoded. 
+ """ + author = self.decode_charset(self.author) + subject = self.decode_charset(self.subject) + if author: + self.decoded['author'] = author + email = self.decode_charset(self.email) + if email: + self.decoded['email'] = email + if subject: + self.decoded['subject'] = subject + + def decode_charset(self, field): + if field.find("=?") == -1: + return None + # Get the decoded header as a list of (s, charset) tuples + pairs = decode_header(field) + # Use __unicode__() until we can guarantee Python 2.2 + try: + # Use a large number for maxlinelen so it won't get wrapped + h = make_header(pairs, 99999) + return h.__unicode__() + except (UnicodeError, LookupError): + # Unknown encoding + return None + # The last value for c will have the proper charset in it + return EMPTYSTRING.join([s for s, c in pairs]) + + def as_html(self): + d = self.__dict__.copy() + # avoid i18n side-effects + otrans = i18n.get_translation() + i18n.set_language(self._lang) + try: + d["prev"], d["prev_wsubj"] = self._get_prev() + d["next"], d["next_wsubj"] = self._get_next() + + d["email_html"] = self.quote(self.email) + d["title"] = self.quote(self.subject) + d["subject_html"] = self.quote(self.subject) + d["subject_url"] = url_quote(self.subject) + d["in_reply_to_url"] = url_quote(self.in_reply_to) + if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: + # Point the mailto url back to the list + author = re.sub('@', _(' at '), self.author) + emailurl = self._mlist.GetListEmail() + else: + author = self.author + emailurl = self.email + d["author_html"] = self.quote(author) + d["email_url"] = url_quote(emailurl) + d["datestr_html"] = self.quote(i18n.ctime(int(self.date))) + d["body"] = self._get_body() + d['listurl'] = self._mlist.GetScriptURL('listinfo', absolute=1) + d['listname'] = self._mlist.real_name + d['encoding'] = '' + finally: + i18n.set_translation(otrans) + + charset = Utils.GetCharSet(self._lang) + d["encoding"] = html_charset % charset + + self._add_decoded(d) + return quick_maketext( + 'article.html', d, + lang=self._lang, mlist=self._mlist) + + def _get_prev(self): + """Return the href and subject for the previous message""" + if self.prev: + subject = self._get_subject_enc(self.prev) + prev = ('<LINK REL="Previous" HREF="%s">' + % (url_quote(self.prev.filename))) + prev_wsubj = ('<LI>' + _('Previous message:') + + ' <A HREF="%s">%s\n</A></li>' + % (url_quote(self.prev.filename), + self.quote(subject))) + else: + prev = prev_wsubj = "" + return prev, prev_wsubj + + def _get_subject_enc(self, art): + """Return the subject of art, decoded if possible. + + If the charset of the current message and art match and the + article's subject is encoded, decode it. 
+ """ + return art.decoded.get('subject', art.subject) + + def _get_next(self): + """Return the href and subject for the previous message""" + if self.next: + subject = self._get_subject_enc(self.next) + next = ('<LINK REL="Next" HREF="%s">' + % (url_quote(self.next.filename))) + next_wsubj = ('<LI>' + _('Next message:') + + ' <A HREF="%s">%s\n</A></li>' + % (url_quote(self.next.filename), + self.quote(subject))) + else: + next = next_wsubj = "" + return next, next_wsubj + + _rx_quote = re.compile('=([A-F0-9][A-F0-9])') + _rx_softline = re.compile('=[ \t]*$') + + def _get_body(self): + """Return the message body ready for HTML, decoded if necessary""" + try: + body = self.html_body + except AttributeError: + body = self.body + return null_to_space(EMPTYSTRING.join(body)) + + def _add_decoded(self, d): + """Add encoded-word keys to HTML output""" + for src, dst in (('author', 'author_html'), + ('email', 'email_html'), + ('subject', 'subject_html'), + ('subject', 'title')): + if self.decoded.has_key(src): + d[dst] = self.quote(self.decoded[src]) + + def as_text(self): + d = self.__dict__.copy() + # We need to guarantee a valid From_ line, even if there are + # bososities in the headers. + if not d.get('fromdate', '').strip(): + d['fromdate'] = time.ctime(time.time()) + if not d.get('email', '').strip(): + d['email'] = 'bogus@does.not.exist.com' + if not d.get('datestr', '').strip(): + d['datestr'] = time.ctime(time.time()) + # + headers = ['From %(email)s %(fromdate)s', + 'From: %(email)s (%(author)s)', + 'Date: %(datestr)s', + 'Subject: %(subject)s'] + if d['_in_reply_to']: + headers.append('In-Reply-To: %(_in_reply_to)s') + if d['_references']: + headers.append('References: %(_references)s') + if d['_message_id']: + headers.append('Message-ID: %(_message_id)s') + body = EMPTYSTRING.join(self.body) + if isinstance(body, types.UnicodeType): + body = body.encode(Utils.GetCharSet(self._lang), 'replace') + return NL.join(headers) % d + '\n\n' + body + + def _set_date(self, message): + self.__super_set_date(message) + self.fromdate = time.ctime(int(self.date)) + + def loadbody_fromHTML(self,fileobj): + self.body = [] + begin = 0 + while 1: + line = fileobj.readline() + if not line: + break + if not begin: + if line.strip() == '<!--beginarticle-->': + begin = 1 + continue + if line.strip() == '<!--endarticle-->': + break + self.body.append(line) + + + +class HyperArchive(pipermail.T): + __super_init = pipermail.T.__init__ + __super_update_archive = pipermail.T.update_archive + __super_update_dirty_archives = pipermail.T.update_dirty_archives + __super_add_article = pipermail.T.add_article + + # some defaults + DIRMODE = 02775 + FILEMODE = 0660 + + VERBOSE = 0 + DEFAULTINDEX = 'thread' + ARCHIVE_PERIOD = 'month' + + THREADLAZY = 0 + THREADLEVELS = 3 + + ALLOWHTML = 1 # "Lines between <html></html>" handled as is. + SHOWHTML = 0 # Eg, nuke leading whitespace in html manner. + IQUOTES = 1 # Italicize quoted text. + SHOWBR = 0 # Add <br> onto every line + + def __init__(self, maillist): + # can't init the database while other processes are writing to it! 
+ # XXX TODO- implement native locking + # with mailman's LockFile module for HyperDatabase.HyperDatabase + # + dir = maillist.archive_dir() + db = HyperDatabase.HyperDatabase(dir, maillist) + self.__super_init(dir, reload=1, database=db) + + self.maillist = maillist + self._lock_file = None + self.lang = maillist.preferred_language + self.charset = Utils.GetCharSet(maillist.preferred_language) + + if hasattr(self.maillist,'archive_volume_frequency'): + if self.maillist.archive_volume_frequency == 0: + self.ARCHIVE_PERIOD='year' + elif self.maillist.archive_volume_frequency == 2: + self.ARCHIVE_PERIOD='quarter' + elif self.maillist.archive_volume_frequency == 3: + self.ARCHIVE_PERIOD='week' + elif self.maillist.archive_volume_frequency == 4: + self.ARCHIVE_PERIOD='day' + else: + self.ARCHIVE_PERIOD='month' + + yre = r'(?P<year>[0-9]{4,4})' + mre = r'(?P<month>[01][0-9])' + dre = r'(?P<day>[0123][0-9])' + self._volre = { + 'year': '^' + yre + '$', + 'quarter': '^' + yre + r'q(?P<quarter>[1234])$', + 'month': '^' + yre + r'-(?P<month>[a-zA-Z]+)$', + 'week': r'^Week-of-Mon-' + yre + mre + dre, + 'day': '^' + yre + mre + dre + '$' + } + + def _makeArticle(self, msg, sequence): + return Article(msg, sequence, + lang=self.maillist.preferred_language, + mlist=self.maillist) + + def html_foot(self): + # avoid i18n side-effects + mlist = self.maillist + otrans = i18n.get_translation() + i18n.set_language(mlist.preferred_language) + # Convenience + def quotetime(s): + return html_quote(i18n.ctime(s), self.lang) + try: + d = {"lastdate": quotetime(self.lastdate), + "archivedate": quotetime(self.archivedate), + "listinfo": mlist.GetScriptURL('listinfo', absolute=1), + "version": self.version, + } + i = {"thread": _("thread"), + "subject": _("subject"), + "author": _("author"), + "date": _("date") + } + finally: + i18n.set_translation(otrans) + + for t in i.keys(): + cap = t[0].upper() + t[1:] + if self.type == cap: + d["%s_ref" % (t)] = "" + else: + d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>' + % (t, i[t])) + return quick_maketext( + 'archidxfoot.html', d, + mlist=mlist) + + def html_head(self): + # avoid i18n side-effects + mlist = self.maillist + otrans = i18n.get_translation() + i18n.set_language(mlist.preferred_language) + # Convenience + def quotetime(s): + return html_quote(i18n.ctime(s), self.lang) + try: + d = {"listname": html_quote(mlist.real_name, self.lang), + "archtype": self.type, + "archive": self.volNameToDesc(self.archive), + "listinfo": mlist.GetScriptURL('listinfo', absolute=1), + "firstdate": quotetime(self.firstdate), + "lastdate": quotetime(self.lastdate), + "size": self.size, + } + i = {"thread": _("thread"), + "subject": _("subject"), + "author": _("author"), + "date": _("date"), + } + finally: + i18n.set_translation(otrans) + + for t in i.keys(): + cap = t[0].upper() + t[1:] + if self.type == cap: + d["%s_ref" % (t)] = "" + d["archtype"] = i[t] + else: + d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>' + % (t, i[t])) + if self.charset: + d["encoding"] = html_charset % self.charset + else: + d["encoding"] = "" + return quick_maketext( + 'archidxhead.html', d, + mlist=mlist) + + def html_TOC(self): + mlist = self.maillist + listname = mlist.internal_name() + mbox = os.path.join(mlist.archive_dir()+'.mbox', listname+'.mbox') + d = {"listname": mlist.real_name, + "listinfo": mlist.GetScriptURL('listinfo', absolute=1), + "fullarch": '../%s.mbox/%s.mbox' % (listname, listname), + "size": sizeof(mbox, mlist.preferred_language), + 'meta': '', + } + # Avoid i18n 
side-effects + otrans = i18n.get_translation() + i18n.set_language(mlist.preferred_language) + try: + if not self.archives: + d["noarchive_msg"] = _( + '<P>Currently, there are no archives. </P>') + d["archive_listing_start"] = "" + d["archive_listing_end"] = "" + d["archive_listing"] = "" + else: + d["noarchive_msg"] = "" + d["archive_listing_start"] = quick_maketext( + 'archliststart.html', + lang=mlist.preferred_language, + mlist=mlist) + d["archive_listing_end"] = quick_maketext( + 'archlistend.html', + mlist=mlist) + + accum = [] + for a in self.archives: + accum.append(self.html_TOC_entry(a)) + d["archive_listing"] = EMPTYSTRING.join(accum) + finally: + i18n.set_translation(otrans) + + # The TOC is always in the charset of the list's preferred language + d['meta'] += html_charset % Utils.GetCharSet(mlist.preferred_language) + + return quick_maketext( + 'archtoc.html', d, + mlist=mlist) + + def html_TOC_entry(self, arch): + # Check to see if the archive is gzip'd or not + txtfile = os.path.join(self.maillist.archive_dir(), arch + '.txt') + gzfile = txtfile + '.gz' + # which exists? .txt.gz first, then .txt + if os.path.exists(gzfile): + file = gzfile + url = arch + '.txt.gz' + templ = '<td><A href="%(url)s">[ ' + _('Gzip\'d Text%(sz)s') \ + + ']</a></td>' + elif os.path.exists(txtfile): + file = txtfile + url = arch + '.txt' + templ = '<td><A href="%(url)s">[ ' + _('Text%(sz)s') + ']</a></td>' + else: + # neither found? + file = None + # in Python 1.5.2 we have an easy way to get the size + if file: + textlink = templ % { + 'url': url, + 'sz' : sizeof(file, self.maillist.preferred_language) + } + else: + # there's no archive file at all... hmmm. + textlink = '' + return quick_maketext( + 'archtocentry.html', + {'archive': arch, + 'archivelabel': self.volNameToDesc(arch), + 'textlink': textlink + }, + mlist=self.maillist) + + def GetArchLock(self): + if self._lock_file: + return 1 + self._lock_file = LockFile.LockFile( + os.path.join(mm_cfg.LOCK_DIR, + self.maillist.internal_name() + '-arch.lock')) + try: + self._lock_file.lock(timeout=0.5) + except LockFile.TimeOutError: + return 0 + return 1 + + def DropArchLock(self): + if self._lock_file: + self._lock_file.unlock(unconditionally=1) + self._lock_file = None + + def processListArch(self): + name = self.maillist.ArchiveFileName() + wname= name+'.working' + ename= name+'.err_unarchived' + try: + os.stat(name) + except (IOError,os.error): + #no archive file, nothin to do -ddm + return + + #see if arch is locked here -ddm + if not self.GetArchLock(): + #another archiver is running, nothing to do. -ddm + return + + #if the working file is still here, the archiver may have + # crashed during archiving. Save it, log an error, and move on. + try: + wf = open(wname) + syslog('error', + 'Archive working file %s present. ' + 'Check %s for possibly unarchived msgs', + wname, ename) + omask = os.umask(007) + try: + ef = open(ename, 'a+') + finally: + os.umask(omask) + ef.seek(1,2) + if ef.read(1) <> '\n': + ef.write('\n') + ef.write(wf.read()) + ef.close() + wf.close() + os.unlink(wname) + except IOError: + pass + os.rename(name,wname) + archfile = open(wname) + self.processUnixMailbox(archfile) + archfile.close() + os.unlink(wname) + self.DropArchLock() + + def get_filename(self, article): + return '%06i.html' % (article.sequence,) + + def get_archives(self, article): + """Return a list of indexes where the article should be filed. 
+ A string can be returned if the list only contains one entry, + and the empty list is legal.""" + res = self.dateToVolName(float(article.date)) + self.message(_("figuring article archives\n")) + self.message(res + "\n") + return res + + def volNameToDesc(self, volname): + volname = volname.strip() + # Don't make these module global constants since we have to runtime + # translate them anyway. + monthdict = [ + '', + _('January'), _('February'), _('March'), _('April'), + _('May'), _('June'), _('July'), _('August'), + _('September'), _('October'), _('November'), _('December') + ] + for each in self._volre.keys(): + match = re.match(self._volre[each], volname) + # Let ValueErrors percolate up + if match: + year = int(match.group('year')) + if each == 'quarter': + d =["", _("First"), _("Second"), _("Third"), _("Fourth") ] + ord = d[int(match.group('quarter'))] + return _("%(ord)s quarter %(year)i") + elif each == 'month': + monthstr = match.group('month').lower() + for i in range(1, 13): + monthname = time.strftime("%B", (1999,i,1,0,0,0,0,1,0)) + if monthstr.lower() == monthname.lower(): + month = monthdict[i] + return _("%(month)s %(year)i") + raise ValueError, "%s is not a month!" % monthstr + elif each == 'week': + month = monthdict[int(match.group("month"))] + day = int(match.group("day")) + return _("The Week Of Monday %(day)i %(month)s %(year)i") + elif each == 'day': + month = monthdict[int(match.group("month"))] + day = int(match.group("day")) + return _("%(day)i %(month)s %(year)i") + else: + return match.group('year') + raise ValueError, "%s is not a valid volname" % volname + +# The following two methods should be inverses of each other. -ddm + + def dateToVolName(self,date): + datetuple=time.localtime(date) + if self.ARCHIVE_PERIOD=='year': + return time.strftime("%Y",datetuple) + elif self.ARCHIVE_PERIOD=='quarter': + if datetuple[1] in [1,2,3]: + return time.strftime("%Yq1",datetuple) + elif datetuple[1] in [4,5,6]: + return time.strftime("%Yq2",datetuple) + elif datetuple[1] in [7,8,9]: + return time.strftime("%Yq3",datetuple) + else: + return time.strftime("%Yq4",datetuple) + elif self.ARCHIVE_PERIOD == 'day': + return time.strftime("%Y%m%d", datetuple) + elif self.ARCHIVE_PERIOD == 'week': + # Reconstruct "seconds since epoch", and subtract weekday + # multiplied by the number of seconds in a day. + monday = time.mktime(datetuple) - datetuple[6] * 24 * 60 * 60 + # Build a new datetuple from this "seconds since epoch" value + datetuple = time.localtime(monday) + return time.strftime("Week-of-Mon-%Y%m%d", datetuple) + # month. 
-ddm + else: + return time.strftime("%Y-%B",datetuple) + + + def volNameToDate(self,volname): + volname = volname.strip() + for each in self._volre.keys(): + match=re.match(self._volre[each],volname) + if match: + year=int(match.group('year')) + month=1 + day = 1 + if each == 'quarter': + q=int(match.group('quarter')) + month=(q*3)-2 + elif each == 'month': + monthstr=match.group('month').lower() + m=[] + for i in range(1,13): + m.append( + time.strftime("%B",(1999,i,1,0,0,0,0,1,0)).lower()) + try: + month=m.index(monthstr)+1 + except ValueError: + pass + elif each == 'week' or each == 'day': + month = int(match.group("month")) + day = int(match.group("day")) + return time.mktime((year,month,1,0,0,0,0,1,-1)) + return 0.0 + + def sortarchives(self): + def sf(a,b,s=self): + al=s.volNameToDate(a) + bl=s.volNameToDate(b) + if al>bl: + return 1 + elif al<bl: + return -1 + else: + return 0 + if self.ARCHIVE_PERIOD in ('month','year','quarter'): + self.archives.sort(sf) + else: + self.archives.sort() + self.archives.reverse() + + def message(self, msg): + if self.VERBOSE: + f = sys.stderr + f.write(msg) + if msg[-1:] != '\n': + f.write('\n') + f.flush() + + def open_new_archive(self, archive, archivedir): + index_html = os.path.join(archivedir, 'index.html') + try: + os.unlink(index_html) + except: + pass + os.symlink(self.DEFAULTINDEX+'.html',index_html) + + def write_index_header(self): + self.depth=0 + print self.html_head() + if not self.THREADLAZY and self.type=='Thread': + self.message(_("Computing threaded index\n")) + self.updateThreadedIndex() + + def write_index_footer(self): + for i in range(self.depth): + print '</UL>' + print self.html_foot() + + def write_index_entry(self, article): + subject = self.get_header("subject", article) + author = self.get_header("author", article) + if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: + author = re.sub('@', _(' at '), author) + subject = CGIescape(subject, self.lang) + author = CGIescape(author, self.lang) + + d = { + 'filename': urllib.quote(article.filename), + 'subject': subject, + 'sequence': article.sequence, + 'author': author + } + print quick_maketext( + 'archidxentry.html', d, + mlist=self.maillist) + + def get_header(self, field, article): + # if we have no decoded header, return the encoded one + result = article.decoded.get(field) + if result is None: + return getattr(article, field) + # otherwise, the decoded one will be Unicode + return result + + def write_threadindex_entry(self, article, depth): + if depth < 0: + self.message('depth<0') + depth = 0 + if depth > self.THREADLEVELS: + depth = self.THREADLEVELS + if depth < self.depth: + for i in range(self.depth-depth): + print '</UL>' + elif depth > self.depth: + for i in range(depth-self.depth): + print '<UL>' + print '<!--%i %s -->' % (depth, article.threadKey) + self.depth = depth + self.write_index_entry(article) + + def write_TOC(self): + self.sortarchives() + omask = os.umask(002) + try: + toc = open(os.path.join(self.basedir, 'index.html'), 'w') + finally: + os.umask(omask) + toc.write(self.html_TOC()) + toc.close() + + def write_article(self, index, article, path): + # called by add_article + omask = os.umask(002) + try: + f = open(path, 'w') + finally: + os.umask(omask) + f.write(article.as_html()) + f.close() + + # Write the text article to the text archive. 
+ path = os.path.join(self.basedir, "%s.txt" % index) + omask = os.umask(002) + try: + f = open(path, 'a+') + finally: + os.umask(omask) + f.write(article.as_text()) + f.close() + + def update_archive(self, archive): + self.__super_update_archive(archive) + # only do this if the gzip module was imported globally, and + # gzip'ing was enabled via mm_cfg.GZIP_ARCHIVE_TXT_FILES. See + # above. + if gzip: + archz = None + archt = None + txtfile = os.path.join(self.basedir, '%s.txt' % archive) + gzipfile = os.path.join(self.basedir, '%s.txt.gz' % archive) + oldgzip = os.path.join(self.basedir, '%s.old.txt.gz' % archive) + try: + # open the plain text file + archt = open(txtfile) + except IOError: + return + try: + os.rename(gzipfile, oldgzip) + archz = gzip.open(oldgzip) + except (IOError, RuntimeError, os.error): + pass + try: + ou = os.umask(002) + newz = gzip.open(gzipfile, 'w') + finally: + # XXX why is this a finally? + os.umask(ou) + if archz: + newz.write(archz.read()) + archz.close() + os.unlink(oldgzip) + # XXX do we really need all this in a try/except? + try: + newz.write(archt.read()) + newz.close() + archt.close() + except IOError: + pass + os.unlink(txtfile) + + _skip_attrs = ('maillist', '_lock_file', 'charset') + + def getstate(self): + d={} + for each in self.__dict__.keys(): + if not (each in self._skip_attrs + or each.upper() == each): + d[each] = self.__dict__[each] + return d + + # Add <A HREF="..."> tags around URLs and e-mail addresses. + + def __processbody_URLquote(self, lines): + # XXX a lot to do here: + # 1. use lines directly, rather than source and dest + # 2. make it clearer + # 3. make it faster + source = lines[:] + dest = lines + last_line_was_quoted = 0 + for i in xrange(0, len(source)): + Lorig = L = source[i] + prefix = suffix = "" + if L is None: + continue + # Italicise quoted text + if self.IQUOTES: + quoted = quotedpat.match(L) + if quoted is None: + last_line_was_quoted = 0 + else: + quoted = quoted.end(0) + prefix = CGIescape(L[:quoted], self.lang) + '<i>' + suffix = '</I>' + if self.SHOWHTML: + suffix += '<BR>' + if not last_line_was_quoted: + prefix = '<BR>' + prefix + L = L[quoted:] + last_line_was_quoted = 1 + # Check for an e-mail address + L2 = "" + jr = emailpat.search(L) + kr = urlpat.search(L) + while jr is not None or kr is not None: + if jr == None: + j = -1 + else: + j = jr.start(0) + if kr is None: + k = -1 + else: + k = kr.start(0) + if j != -1 and (j < k or k == -1): + text = jr.group(1) + length = len(text) + if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: + text = re.sub('@', _(' at '), text) + URL = self.maillist.GetScriptURL( + 'listinfo', absolute=1) + else: + URL = 'mailto:' + text + pos = j + elif k != -1 and (j > k or j == -1): + text = URL = kr.group(1) + length = len(text) + pos = k + else: # j==k + raise ValueError, "j==k: This can't happen!" + #length = len(text) + #self.message("URL: %s %s %s \n" + # % (CGIescape(L[:pos]), URL, CGIescape(text))) + L2 += '%s<A HREF="%s">%s</A>' % ( + CGIescape(L[:pos], self.lang), + html_quote(URL), CGIescape(text, self.lang)) + L = L[pos+length:] + jr = emailpat.search(L) + kr = urlpat.search(L) + if jr is None and kr is None: + L = CGIescape(L, self.lang) + L = prefix + L2 + L + suffix + source[i] = None + dest[i] = L + + # Perform Hypermail-style processing of <HTML></HTML> directives + # in message bodies. Lines between <HTML> and </HTML> will be written + # out precisely as they are; other lines will be passed to func2 + # for further processing . 
+ + def __processbody_HTML(self, lines): + # XXX need to make this method modify in place + source = lines[:] + dest = lines + l = len(source) + i = 0 + while i < l: + while i < l and htmlpat.match(source[i]) is None: + i = i + 1 + if i < l: + source[i] = None + i = i + 1 + while i < l and nohtmlpat.match(source[i]) is None: + dest[i], source[i] = source[i], None + i = i + 1 + if i < l: + source[i] = None + i = i + 1 + + def format_article(self, article): + # called from add_article + # TBD: Why do the HTML formatting here and keep it in the + # pipermail database? It makes more sense to do the html + # formatting as the article is being written as html and toss + # the data after it has been written to the archive file. + lines = filter(None, article.body) + # Handle <HTML> </HTML> directives + if self.ALLOWHTML: + self.__processbody_HTML(lines) + self.__processbody_URLquote(lines) + if not self.SHOWHTML and lines: + lines.insert(0, '<PRE>') + lines.append('</PRE>') + else: + # Do fancy formatting here + if self.SHOWBR: + lines = map(lambda x:x + "<BR>", lines) + else: + for i in range(0, len(lines)): + s = lines[i] + if s[0:1] in ' \t\n': + lines[i] = '<P>' + s + article.html_body = lines + return article + + def update_article(self, arcdir, article, prev, next): + seq = article.sequence + filename = os.path.join(arcdir, article.filename) + self.message(_('Updating HTML for article %(seq)s')) + try: + f = open(filename) + article.loadbody_fromHTML(f) + f.close() + except IOError, e: + if e.errno <> errno.ENOENT: raise + self.message(_('article file %(filename)s is missing!')) + article.prev = prev + article.next = next + omask = os.umask(002) + try: + f = open(filename, 'w') + finally: + os.umask(omask) + f.write(article.as_html()) + f.close() diff --git a/Mailman/Archiver/HyperDatabase.py b/Mailman/Archiver/HyperDatabase.py new file mode 100644 index 00000000..ab41b824 --- /dev/null +++ b/Mailman/Archiver/HyperDatabase.py @@ -0,0 +1,338 @@ +# Copyright (C) 1998,1999,2000,2001,2002 by the Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# +# site modules +# +import os +import marshal +import time +import errno + +# +# package/project modules +# +import pipermail +from Mailman import LockFile + +CACHESIZE = pipermail.CACHESIZE + +try: + import cPickle + pickle = cPickle +except ImportError: + import pickle + +# +# we're using a python dict in place of +# of bsddb.btree database. only defining +# the parts of the interface used by class HyperDatabase +# only one thing can access this at a time. +# +class DumbBTree: + """Stores pickles of Article objects + + This dictionary-like object stores pickles of all the Article + objects. The object itself is stored using marshal. It would be + much simpler, and probably faster, to store the actual objects in + the DumbBTree and pickle it. 
+ + TBD: Also needs a more sensible name, like IteratableDictionary or + SortedDictionary. + """ + + def __init__(self, path): + self.current_index = 0 + self.path = path + self.lockfile = LockFile.LockFile(self.path + ".lock") + self.lock() + self.__dirty = 0 + self.dict = {} + self.sorted = [] + self.load() + + def __repr__(self): + return "DumbBTree(%s)" % self.path + + def __sort(self, dirty=None): + if self.__dirty == 1 or dirty: + self.sorted = self.dict.keys() + self.sorted.sort() + self.__dirty = 0 + + def lock(self): + self.lockfile.lock() + + def unlock(self): + try: + self.lockfile.unlock() + except LockFile.NotLockedError: + pass + + def __delitem__(self, item): + # if first hasn't been called, we can skip the sort + if self.current_index == 0: + del self.dict[item] + self.__dirty = 1 + return + try: + ci = self.sorted[self.current_index] + except IndexError: + ci = None + if ci == item: + try: + ci = self.sorted[self.current_index + 1] + except IndexError: + ci = None + del self.dict[item] + self.__sort(dirty=1) + if ci is not None: + self.current_index = self.sorted.index(ci) + else: + self.current_index = self.current_index + 1 + + def clear(self): + # bulk clearing much faster than deleting each item, esp. with the + # implementation of __delitem__() above :( + self.dict = {} + + def first(self): + self.__sort() # guarantee that the list is sorted + if not self.sorted: + raise KeyError + else: + key = self.sorted[0] + self.current_index = 1 + return key, self.dict[key] + + def last(self): + if not self.sorted: + raise KeyError + else: + key = self.sorted[-1] + self.current_index = len(self.sorted) - 1 + return key, self.dict[key] + + def next(self): + try: + key = self.sorted[self.current_index] + except IndexError: + raise KeyError + self.current_index = self.current_index + 1 + return key, self.dict[key] + + def has_key(self, key): + return self.dict.has_key(key) + + def set_location(self, loc): + if not self.dict.has_key(loc): + raise KeyError + self.current_index = self.sorted.index(loc) + + def __getitem__(self, item): + return self.dict[item] + + def __setitem__(self, item, val): + # if first hasn't been called, then we don't need to worry + # about sorting again + if self.current_index == 0: + self.dict[item] = val + self.__dirty = 1 + return + try: + current_item = self.sorted[self.current_index] + except IndexError: + current_item = item + self.dict[item] = val + self.__sort(dirty=1) + self.current_index = self.sorted.index(current_item) + + def __len__(self): + return len(self.sorted) + + def load(self): + try: + fp = open(self.path) + try: + self.dict = marshal.load(fp) + finally: + fp.close() + except IOError, e: + if e.errno <> errno.ENOENT: raise + pass + except EOFError: + pass + else: + self.__sort(dirty=1) + + def close(self): + omask = os.umask(007) + try: + fp = open(self.path, 'w') + finally: + os.umask(omask) + fp.write(marshal.dumps(self.dict)) + fp.close() + self.unlock() + + +# this is lifted straight out of pipermail with +# the bsddb.btree replaced with above class. 
+# didn't use inheritance because of all the +# __internal stuff that needs to be here -scott +# +class HyperDatabase(pipermail.Database): + __super_addArticle = pipermail.Database.addArticle + + def __init__(self, basedir, mlist): + self.__cache = {} + self.__currentOpenArchive = None # The currently open indices + self._mlist = mlist + self.basedir = os.path.expanduser(basedir) + # Recently added articles, indexed only by message ID + self.changed={} + + def firstdate(self, archive): + self.__openIndices(archive) + date = 'None' + try: + datekey, msgid = self.dateIndex.first() + date = time.asctime(time.localtime(float(datekey[0]))) + except KeyError: + pass + return date + + def lastdate(self, archive): + self.__openIndices(archive) + date = 'None' + try: + datekey, msgid = self.dateIndex.last() + date = time.asctime(time.localtime(float(datekey[0]))) + except KeyError: + pass + return date + + def numArticles(self, archive): + self.__openIndices(archive) + return len(self.dateIndex) + + def addArticle(self, archive, article, subject=None, author=None, + date=None): + self.__openIndices(archive) + self.__super_addArticle(archive, article, subject, author, date) + + def __openIndices(self, archive): + if self.__currentOpenArchive == archive: + return + self.__closeIndices() + arcdir = os.path.join(self.basedir, 'database') + omask = os.umask(0) + try: + try: + os.mkdir(arcdir, 02770) + except OSError, e: + if e.errno <> errno.EEXIST: raise + finally: + os.umask(omask) + for i in ('date', 'author', 'subject', 'article', 'thread'): + t = DumbBTree(os.path.join(arcdir, archive + '-' + i)) + setattr(self, i + 'Index', t) + self.__currentOpenArchive = archive + + def __closeIndices(self): + for i in ('date', 'author', 'subject', 'thread', 'article'): + attr = i + 'Index' + if hasattr(self, attr): + index = getattr(self, attr) + if i == 'article': + if not hasattr(self, 'archive_length'): + self.archive_length = {} + l = len(index) + self.archive_length[self.__currentOpenArchive] = l + index.close() + delattr(self, attr) + self.__currentOpenArchive = None + + def close(self): + self.__closeIndices() + + def hasArticle(self, archive, msgid): + self.__openIndices(archive) + return self.articleIndex.has_key(msgid) + + def setThreadKey(self, archive, key, msgid): + self.__openIndices(archive) + self.threadIndex[key]=msgid + + def getArticle(self, archive, msgid): + self.__openIndices(archive) + if not self.__cache.has_key(msgid): + # get the pickled object out of the DumbBTree + buf = self.articleIndex[msgid] + article = self.__cache[msgid] = pickle.loads(buf) + # For upgrading older archives + article.setListIfUnset(self._mlist) + else: + article = self.__cache[msgid] + return article + + def first(self, archive, index): + self.__openIndices(archive) + index = getattr(self, index + 'Index') + try: + key, msgid = index.first() + return msgid + except KeyError: + return None + + def next(self, archive, index): + self.__openIndices(archive) + index = getattr(self, index + 'Index') + try: + key, msgid = index.next() + return msgid + except KeyError: + return None + + def getOldestArticle(self, archive, subject): + self.__openIndices(archive) + subject = subject.lower() + try: + key, tempid=self.subjectIndex.set_location(subject) + self.subjectIndex.next() + [subject2, date]= key.split('\0') + if subject!=subject2: return None + return tempid + except KeyError: + return None + + def newArchive(self, archive): + pass + + def clearIndex(self, archive, index): + self.__openIndices(archive) + if 
hasattr(self.threadIndex, 'clear'): + self.threadIndex.clear() + return + finished=0 + try: + key, msgid=self.threadIndex.first() + except KeyError: finished=1 + while not finished: + del self.threadIndex[key] + try: + key, msgid=self.threadIndex.next() + except KeyError: finished=1 diff --git a/Mailman/Archiver/Makefile.in b/Mailman/Archiver/Makefile.in new file mode 100644 index 00000000..fe56149d --- /dev/null +++ b/Mailman/Archiver/Makefile.in @@ -0,0 +1,72 @@ +# Copyright (C) 1998,1999,2000,2001,2002 by the Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# NOTE: Makefile.in is converted into Makefile by the configure script +# in the parent directory. Once configure has run, you can recreate +# the Makefile by running just config.status. + +# Variables set by configure + +VPATH= @srcdir@ +srcdir= @srcdir@ +bindir= @bindir@ +prefix= @prefix@ +exec_prefix= @exec_prefix@ + +CC= @CC@ +CHMOD= @CHMOD@ +INSTALL= @INSTALL@ + +DEFS= @DEFS@ + +# Customizable but not set by configure + +OPT= @OPT@ +CFLAGS= $(OPT) $(DEFS) +PACKAGEDIR= $(prefix)/Mailman/Archiver +SHELL= /bin/sh + +MODULES= __init__.py Archiver.py HyperArch.py HyperDatabase.py \ +pipermail.py + + +# Modes for directories and executables created by the install +# process. Default to group-writable directories but +# user-only-writable for executables. +DIRMODE= 775 +EXEMODE= 755 +FILEMODE= 644 +INSTALL_PROGRAM=$(INSTALL) -m $(EXEMODE) + + +# Rules + +all: + +install: + for f in $(MODULES); \ + do \ + $(INSTALL) -m $(FILEMODE) $(srcdir)/$$f $(PACKAGEDIR); \ + done + +finish: + +clean: + +distclean: + -rm *.pyc + -rm Makefile + diff --git a/Mailman/Archiver/__init__.py b/Mailman/Archiver/__init__.py new file mode 100644 index 00000000..65ad7be7 --- /dev/null +++ b/Mailman/Archiver/__init__.py @@ -0,0 +1,17 @@ +# Copyright (C) 1998,1999,2000,2001,2002 by the Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +from Archiver import * diff --git a/Mailman/Archiver/pipermail.py b/Mailman/Archiver/pipermail.py new file mode 100644 index 00000000..2e1b226d --- /dev/null +++ b/Mailman/Archiver/pipermail.py @@ -0,0 +1,854 @@ +#! 
/usr/bin/env python + +from __future__ import nested_scopes + +import mailbox +import os +import re +import sys +import time +from email.Utils import parseaddr, parsedate_tz +import cPickle as pickle +from cStringIO import StringIO +from string import lowercase + +__version__ = '0.09 (Mailman edition)' +VERSION = __version__ +CACHESIZE = 100 # Number of slots in the cache + +from Mailman import Errors +from Mailman.Mailbox import ArchiverMailbox +from Mailman.Logging.Syslog import syslog +from Mailman.i18n import _ + +SPACE = ' ' + + + +msgid_pat = re.compile(r'(<.*>)') +def strip_separators(s): + "Remove quotes or parenthesization from a Message-ID string" + if not s: + return "" + if s[0] in '"<([' and s[-1] in '">)]': + s = s[1:-1] + return s + +smallNameParts = ['van', 'von', 'der', 'de'] + +def fixAuthor(author): + "Canonicalize a name into Last, First format" + # If there's a comma, guess that it's already in "Last, First" format + if ',' in author: + return author + L = author.split() + i = len(L) - 1 + if i == 0: + return author # The string's one word--forget it + if author.upper() == author or author.lower() == author: + # Damn, the name is all upper- or lower-case. + while i > 0 and L[i-1].lower() in smallNameParts: + i = i - 1 + else: + # Mixed case; assume that small parts of the last name will be + # in lowercase, and check them against the list. + while i>0 and (L[i-1][0] in lowercase or + L[i-1].lower() in smallNameParts): + i = i - 1 + author = SPACE.join(L[-1:] + L[i:-1]) + ', ' + SPACE.join(L[:i]) + return author + +# Abstract class for databases + +class DatabaseInterface: + def __init__(self): pass + def close(self): pass + def getArticle(self, archive, msgid): pass + def hasArticle(self, archive, msgid): pass + def addArticle(self, archive, article, subject=None, author=None, + date=None): pass + def firstdate(self, archive): pass + def lastdate(self, archive): pass + def first(self, archive, index): pass + def next(self, archive, index): pass + def numArticles(self, archive): pass + def newArchive(self, archive): pass + def setThreadKey(self, archive, key, msgid): pass + def getOldestArticle(self, subject): pass + +class Database(DatabaseInterface): + """Define the basic sorting logic for a database + + Assumes that the database internally uses dateIndex, authorIndex, + etc. + """ + + # TBD Factor out more of the logic shared between BSDDBDatabase + # and HyperDatabase and place it in this class. + + def __init__(self): + # This method need not be called by subclasses that do their + # own initialization. 
+ self.dateIndex = {} + self.authorIndex = {} + self.subjectIndex = {} + self.articleIndex = {} + self.changed = {} + + def addArticle(self, archive, article, subject=None, author=None, + date=None): + # create the keys; always end w/ msgid which will be unique + authorkey = (author or article.author, article.date, + article.msgid) + subjectkey = (subject or article.subject, article.date, + article.msgid) + datekey = date or article.date, article.msgid + + # Add the new article + self.dateIndex[datekey] = article.msgid + self.authorIndex[authorkey] = article.msgid + self.subjectIndex[subjectkey] = article.msgid + + self.store_article(article) + self.changed[archive, article.msgid] = None + + parentID = article.parentID + if parentID is not None and self.articleIndex.has_key(parentID): + parent = self.getArticle(archive, parentID) + myThreadKey = parent.threadKey + article.date + '-' + else: + myThreadKey = article.date + '-' + article.threadKey = myThreadKey + key = myThreadKey, article.msgid + self.setThreadKey(archive, key, article.msgid) + + def store_article(self, article): + """Store article without message body to save space""" + # TBD this is not thread safe! + temp = article.body + article.body = [] + self.articleIndex[article.msgid] = pickle.dumps(article) + article.body = temp + +# The Article class encapsulates a single posting. The attributes +# are: +# +# sequence : Sequence number, unique for each article in a set of archives +# subject : Subject +# datestr : The posting date, in human-readable format +# date : The posting date, in purely numeric format +# headers : Any other headers of interest +# author : The author's name (and possibly organization) +# email : The author's e-mail address +# msgid : A unique message ID +# in_reply_to: If != "", this is the msgid of the article being replied to +# references : A (possibly empty) list of msgid's of earlier articles +# in the thread +# body : A list of strings making up the message body + +class Article: + _last_article_time = time.time() + + def __init__(self, message = None, sequence = 0, keepHeaders = []): + if message is None: + return + self.sequence = sequence + + self.parentID = None + self.threadKey = None + # otherwise the current sequence number is used. + id = strip_separators(message['Message-Id']) + if id == "": + self.msgid = str(self.sequence) + else: self.msgid = id + + if message.has_key('Subject'): + self.subject = str(message['Subject']) + else: + self.subject = _('No subject') + if self.subject == "": self.subject = _('No subject') + + self._set_date(message) + + # Figure out the e-mail address and poster's name. Use the From: + # field first, followed by Reply-To: + self.author, self.email = parseaddr(message.get('From', '')) + e = message['Reply-To'] + if not self.email and e is not None: + ignoreauthor, self.email = parseaddr(e) + self.email = strip_separators(self.email) + self.author = strip_separators(self.author) + + if self.author == "": + self.author = self.email + + # Save the In-Reply-To:, References:, and Message-ID: lines + # + # TBD: The original code does some munging on these fields, which + # shouldn't be necessary, but changing this may break code. For + # safety, I save the original headers on different attributes for use + # in writing the plain text periodic flat files. 
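# Illustration (not from the diff) of the two helpers the header parsing
# below relies on:
#     msgid_pat.search('see <abc@example.com> for details').group(1)
#         -> '<abc@example.com>'
#     strip_separators('<abc@example.com>')
#         -> 'abc@example.com'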
+ self._in_reply_to = message['in-reply-to'] + self._references = message['references'] + self._message_id = message['message-id'] + + i_r_t = message['In-Reply-To'] + if i_r_t is None: + self.in_reply_to = '' + else: + match = msgid_pat.search(i_r_t) + if match is None: self.in_reply_to = '' + else: self.in_reply_to = strip_separators(match.group(1)) + + references = message['References'] + if references is None: + self.references = [] + else: + self.references = map(strip_separators, references.split()) + + # Save any other interesting headers + self.headers = {} + for i in keepHeaders: + if message.has_key(i): + self.headers[i] = message[i] + + # Read the message body + s = StringIO(message.get_payload()) + self.body = s.readlines() + + def _set_date(self, message): + def floatdate(header): + missing = [] + datestr = message.get(header, missing) + if datestr is missing: + return None + date = parsedate_tz(datestr) + try: + return time.mktime(date[:9]) + except (ValueError, OverflowError): + return None + date = floatdate('date') + if date is None: + date = floatdate('x-list-received-date') + if date is None: + # What's left to try? + date = self._last_article_time + 1 + self._last_article_time = date + self.date = '%011i' % date + + def __repr__(self): + return '<Article ID = '+repr(self.msgid)+'>' + +# Pipermail formatter class + +class T: + DIRMODE = 0755 # Mode to give to created directories + FILEMODE = 0644 # Mode to give to created files + INDEX_EXT = ".html" # Extension for indexes + + def __init__(self, basedir = None, reload = 1, database = None): + # If basedir isn't provided, assume the current directory + if basedir is None: + self.basedir = os.getcwd() + else: + basedir = os.path.expanduser(basedir) + self.basedir = basedir + self.database = database + + # If the directory doesn't exist, create it. This code shouldn't get + # run anymore, we create the directory in Archiver.py. It should only + # get used by legacy lists created that are only receiving their first + # message in the HTML archive now -- Marc + try: + os.stat(self.basedir) + except os.error, errdata: + errno, errmsg = errdata + if errno != 2: + raise os.error, errdata + else: + self.message(_('Creating archive directory ') + self.basedir) + omask = os.umask(0) + try: + os.mkdir(self.basedir, self.DIRMODE) + finally: + os.umask(omask) + + # Try to load previously pickled state + try: + if not reload: + raise IOError + f = open(os.path.join(self.basedir, 'pipermail.pck'), 'r') + self.message(_('Reloading pickled archive state')) + d = pickle.load(f) + f.close() + for key, value in d.items(): + setattr(self, key, value) + except (IOError, EOFError): + # No pickled version, so initialize various attributes + self.archives = [] # Archives + self._dirty_archives = [] # Archives that will have to be updated + self.sequence = 0 # Sequence variable used for + # numbering articles + self.update_TOC = 0 # Does the TOC need updating? + # + # make the basedir variable work when passed in as an __init__ arg + # and different from the one in the pickle. Let the one passed in + # as an __init__ arg take precedence if it's stated. This way, an + # archive can be moved from one place to another and still work. + # + if basedir != self.basedir: + self.basedir = basedir + + def close(self): + "Close an archive, save its state, and update any changed archives." 
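# Illustration (not from the diff): because getstate() defaults to
# self.__dict__, the pipermail.pck written below can be inspected offline,
# e.g. (path assumed):
#     import cPickle
#     d = cPickle.load(open('archives/private/mylist/pipermail.pck'))
#     print d['sequence'], d['archives']   # next article number and the
#                                          # list of known archive volumes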
+        self.update_dirty_archives()
+        self.update_TOC = 0
+        self.write_TOC()
+        # Save the collective state
+        self.message(_('Pickling archive state into ')
+                     + os.path.join(self.basedir, 'pipermail.pck'))
+        self.database.close()
+        del self.database
+
+        omask = os.umask(007)
+        try:
+            f = open(os.path.join(self.basedir, 'pipermail.pck'), 'w')
+        finally:
+            os.umask(omask)
+        pickle.dump(self.getstate(), f)
+        f.close()
+
+    def getstate(self):
+        # can override this in subclass
+        return self.__dict__
+
+    #
+    # Private methods
+    #
+    # These will be neither overridden nor called by custom archivers.
+    #
+
+    # Create a dictionary of various parameters that will be passed
+    # to the write_index_{header,footer} functions
+    def __set_parameters(self, archive):
+        # Determine the earliest and latest date in the archive
+        firstdate = self.database.firstdate(archive)
+        lastdate = self.database.lastdate(archive)
+
+        # Get the current time
+        now = time.asctime(time.localtime(time.time()))
+        self.firstdate = firstdate
+        self.lastdate = lastdate
+        self.archivedate = now
+        self.size = self.database.numArticles(archive)
+        self.archive = archive
+        self.version = __version__
+
+    # Find the message ID of an article's parent, or return None
+    # if no parent can be found.
+    def __findParent(self, article, children = []):
+        parentID = None
+        if article.in_reply_to:
+            parentID = article.in_reply_to
+        elif article.references:
+            # Remove article IDs that aren't in the archive
+            refs = [r for r in article.references
+                    if self.database.hasArticle(self.archive, r)]
+            if not refs:
+                return None
+            maxdate = self.database.getArticle(self.archive, refs[0])
+            for ref in refs[1:]:
+                a = self.database.getArticle(self.archive, ref)
+                if a.date > maxdate.date:
+                    maxdate = a
+            parentID = maxdate.msgid
+        else:
+            # Look for the oldest article with a matching subject, and
+            # assume this is a follow-up to that article
+            tempid = self.database.getOldestArticle(self.archive,
+                                                    article.subject)
+            if tempid is not None and tempid not in children:
+                parentID = tempid
+        return parentID
+
+    # Update the threaded index completely
+    def updateThreadedIndex(self):
+        # Erase the threaded index
+        self.database.clearIndex(self.archive, 'thread')
+
+        # Loop over all the articles
+        msgid = self.database.first(self.archive, 'date')
+        while msgid is not None:
+            try:
+                article = self.database.getArticle(self.archive, msgid)
+            except KeyError:
+                pass
+            else:
+                if article.parentID is not None and \
+                   self.database.hasArticle(self.archive,
+                                            article.parentID):
+                    parent = self.database.getArticle(self.archive,
+                                                      article.parentID)
+                    article.threadKey = parent.threadKey + article.date + '-'
+                    self.database.setThreadKey(self.archive,
+                                               (article.threadKey,
+                                                article.msgid),
+                                               msgid)
+            msgid = self.database.next(self.archive, 'date')
+
+    #
+    # Public methods:
+    #
+    # These are part of the public interface of the T class, but will
+    # never be overridden (unless you're trying to do something very new).
+
+    # Update a single archive's indices, whether the archive's been
+    # dirtied or not.
+ def update_archive(self, archive): + self.archive = archive + self.message(_("Updating index files for archive [%(archive)s]")) + arcdir = os.path.join(self.basedir, archive) + self.__set_parameters(archive) + + for hdr in ('Date', 'Subject', 'Author'): + self._update_simple_index(hdr, archive, arcdir) + + self._update_thread_index(archive, arcdir) + + def _update_simple_index(self, hdr, archive, arcdir): + self.message(" " + hdr) + self.type = hdr + hdr = hdr.lower() + + self._open_index_file_as_stdout(arcdir, hdr) + self.write_index_header() + count = 0 + # Loop over the index entries + msgid = self.database.first(archive, hdr) + while msgid is not None: + try: + article = self.database.getArticle(self.archive, msgid) + except KeyError: + pass + else: + count = count + 1 + self.write_index_entry(article) + msgid = self.database.next(archive, hdr) + # Finish up this index + self.write_index_footer() + self._restore_stdout() + + def _update_thread_index(self, archive, arcdir): + self.message(_(" Thread")) + self._open_index_file_as_stdout(arcdir, "thread") + self.type = 'Thread' + self.write_index_header() + + # To handle the prev./next in thread pointers, we need to + # track articles 5 at a time. + + # Get the first 5 articles + L = [None] * 5 + i = 2 + msgid = self.database.first(self.archive, 'thread') + + while msgid is not None and i < 5: + L[i] = self.database.getArticle(self.archive, msgid) + i = i + 1 + msgid = self.database.next(self.archive, 'thread') + + while L[2] is not None: + article = L[2] + artkey = None + if article is not None: + artkey = article.threadKey + if artkey is not None: + self.write_threadindex_entry(article, artkey.count('-') - 1) + if self.database.changed.has_key((archive,article.msgid)): + a1 = L[1] + a3 = L[3] + self.update_article(arcdir, article, a1, a3) + if a3 is not None: + self.database.changed[(archive, a3.msgid)] = None + if a1 is not None: + key = archive, a1.msgid + if not self.database.changed.has_key(key): + self.update_article(arcdir, a1, L[0], L[2]) + else: + del self.database.changed[key] + L = L[1:] # Rotate the list + if msgid is None: + L.append(msgid) + else: + L.append(self.database.getArticle(self.archive, msgid)) + msgid = self.database.next(self.archive, 'thread') + + self.write_index_footer() + self._restore_stdout() + + def _open_index_file_as_stdout(self, arcdir, index_name): + path = os.path.join(arcdir, index_name + self.INDEX_EXT) + omask = os.umask(002) + try: + self.__f = open(path, 'w') + finally: + os.umask(omask) + self.__stdout = sys.stdout + sys.stdout = self.__f + + def _restore_stdout(self): + sys.stdout = self.__stdout + self.__f.close() + del self.__f + del self.__stdout + + # Update only archives that have been marked as "changed". + def update_dirty_archives(self): + for i in self._dirty_archives: + self.update_archive(i) + self._dirty_archives = [] + + # Read a Unix mailbox file from the file object <input>, + # and create a series of Article objects. Each article + # object will then be archived. 
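# A worked illustration (not from the diff) of the threadKey scheme used
# throughout: dates are the fixed-width '%011i' strings from _set_date, so
# lexicographic order is chronological order, and a reply's key is its
# parent's key plus its own date --
#     root        '01000000000-'
#     reply       '01000000000-01000000100-'
#     later root  '01000000050-'
# Sorted as plain strings, the reply lands directly under its root (the
# parent's key is a prefix) and before the later root; the indentation
# depth used by write_threadindex_entry is key.count('-') - 1.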
+ + def _makeArticle(self, msg, sequence): + return Article(msg, sequence) + + def processUnixMailbox(self, input, start=None, end=None): + mbox = ArchiverMailbox(input, self.maillist) + if start is None: + start = 0 + counter = 0 + while counter < start: + try: + m = mbox.next() + except Errors.DiscardMessage: + continue + if m is None: + return + counter += 1 + while 1: + try: + pos = input.tell() + m = mbox.next() + except Errors.DiscardMessage: + continue + except Exception: + syslog('error', 'uncaught archiver exception at filepos: %s', + pos) + raise + if m is None: + break + if m == '': + # It was an unparseable message + continue + msgid = m.get('message-id', 'n/a') + self.message(_('#%(counter)05d %(msgid)s')) + a = self._makeArticle(m, self.sequence) + self.sequence += 1 + self.add_article(a) + if end is not None and counter >= end: + break + counter += 1 + + def new_archive(self, archive, archivedir): + self.archives.append(archive) + self.update_TOC = 1 + self.database.newArchive(archive) + # If the archive directory doesn't exist, create it + try: + os.stat(archivedir) + except os.error, errdata: + errno, errmsg = errdata + if errno == 2: + omask = os.umask(0) + try: + os.mkdir(archivedir, self.DIRMODE) + finally: + os.umask(omask) + else: + raise os.error, errdata + self.open_new_archive(archive, archivedir) + + def add_article(self, article): + archives = self.get_archives(article) + if not archives: + return + if type(archives) == type(''): + archives = [archives] + + article.filename = filename = self.get_filename(article) + temp = self.format_article(article) + for arch in archives: + self.archive = arch # why do this??? + archivedir = os.path.join(self.basedir, arch) + if arch not in self.archives: + self.new_archive(arch, archivedir) + + # Write the HTML-ized article + self.write_article(arch, temp, os.path.join(archivedir, + filename)) + + author = fixAuthor(article.author) + subject = article.subject.lower() + + article.parentID = parentID = self.get_parent_info(arch, article) + if parentID: + parent = self.database.getArticle(arch, parentID) + article.threadKey = parent.threadKey + article.date + '-' + else: + article.threadKey = article.date + '-' + key = article.threadKey, article.msgid + + self.database.setThreadKey(arch, key, article.msgid) + self.database.addArticle(arch, temp, author=author, + subject=subject) + + if arch not in self._dirty_archives: + self._dirty_archives.append(arch) + + def get_parent_info(self, archive, article): + parentID = None + if article.in_reply_to: + parentID = article.in_reply_to + elif article.references: + refs = self._remove_external_references(article.references) + if refs: + maxdate = self.database.getArticle(archive, refs[0]) + for ref in refs[1:]: + a = self.database.getArticle(archive, ref) + if a.date > maxdate.date: + maxdate = a + parentID = maxdate.msgid + else: + # Get the oldest article with a matching subject, and + # assume this is a follow-up to that article + parentID = self.database.getOldestArticle(archive, + article.subject) + + if parentID and not self.database.hasArticle(archive, parentID): + parentID = None + return parentID + + def write_article(self, index, article, path): + omask = os.umask(002) + try: + f = open(path, 'w') + finally: + os.umask(omask) + temp_stdout, sys.stdout = sys.stdout, f + self.write_article_header(article) + sys.stdout.writelines(article.body) + self.write_article_footer(article) + sys.stdout = temp_stdout + f.close() + + def _remove_external_references(self, refs): + keep = [] + 
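# (Not in the original: "external" here means a References: ID that was
#  never archived -- e.g. a reply to off-list mail -- which would otherwise
#  become a dangling parentID in get_parent_info above.)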
for ref in refs: + if self.database.hasArticle(self.archive, ref): + keep.append(ref) + return keep + + # Abstract methods: these will need to be overridden by subclasses + # before anything useful can be done. + + def get_filename(self, article): + pass + def get_archives(self, article): + """Return a list of indexes where the article should be filed. + A string can be returned if the list only contains one entry, + and the empty list is legal.""" + pass + def format_article(self, article): + pass + def write_index_header(self): + pass + def write_index_footer(self): + pass + def write_index_entry(self, article): + pass + def write_threadindex_entry(self, article, depth): + pass + def write_article_header(self, article): + pass + def write_article_footer(self, article): + pass + def write_article_entry(self, article): + pass + def update_article(self, archivedir, article, prev, next): + pass + def write_TOC(self): + pass + def open_new_archive(self, archive, dir): + pass + def message(self, msg): + pass + + +class BSDDBdatabase(Database): + __super_addArticle = Database.addArticle + + def __init__(self, basedir): + self.__cachekeys = [] + self.__cachedict = {} + self.__currentOpenArchive = None # The currently open indices + self.basedir = os.path.expanduser(basedir) + self.changed = {} # Recently added articles, indexed only by + # message ID + + def firstdate(self, archive): + self.__openIndices(archive) + date = 'None' + try: + date, msgid = self.dateIndex.first() + date = time.asctime(time.localtime(float(date))) + except KeyError: + pass + return date + + def lastdate(self, archive): + self.__openIndices(archive) + date = 'None' + try: + date, msgid = self.dateIndex.last() + date = time.asctime(time.localtime(float(date))) + except KeyError: + pass + return date + + def numArticles(self, archive): + self.__openIndices(archive) + return len(self.dateIndex) + + def addArticle(self, archive, article, subject=None, author=None, + date=None): + self.__openIndices(archive) + self.__super_addArticle(archive, article, subject, author, date) + + # Open the BSDDB files that are being used as indices + # (dateIndex, authorIndex, subjectIndex, articleIndex) + def __openIndices(self, archive): + if self.__currentOpenArchive == archive: + return + + import bsddb + self.__closeIndices() + arcdir = os.path.join(self.basedir, 'database') + omask = os.umask(0) + try: + try: + os.mkdir(arcdir, 02775) + except OSError: + # BAW: Hmm... 
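# (Not in the original: the bare "except OSError" above swallows the
#  expected EEXIST when database/ already exists, but also any other mkdir
#  failure.  A stricter sketch, assuming errno were imported here:
#      except OSError, e:
#          if e.errno <> errno.EEXIST:
#              raise
# )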
+                pass
+        finally:
+            os.umask(omask)
+        for hdr in ('date', 'author', 'subject', 'article', 'thread'):
+            path = os.path.join(arcdir, archive + '-' + hdr)
+            t = bsddb.btopen(path, 'c')
+            setattr(self, hdr + 'Index', t)
+        self.__currentOpenArchive = archive
+
+    # Close the BSDDB files that are being used as indices (if they're
+    # open--this is safe to call if they're already closed)
+    def __closeIndices(self):
+        for hdr in ('date', 'author', 'subject', 'thread', 'article'):
+            attr = hdr + 'Index'
+            if hasattr(self, attr):
+                index = getattr(self, attr)
+                if hdr == 'article':
+                    if not hasattr(self, 'archive_length'):
+                        self.archive_length = {}
+                    self.archive_length[self.__currentOpenArchive] = \
+                        len(index)
+                index.close()
+                delattr(self, attr)
+        self.__currentOpenArchive = None
+
+    def close(self):
+        self.__closeIndices()
+
+    def hasArticle(self, archive, msgid):
+        self.__openIndices(archive)
+        return self.articleIndex.has_key(msgid)
+
+    def setThreadKey(self, archive, key, msgid):
+        self.__openIndices(archive)
+        self.threadIndex[key] = msgid
+
+    def getArticle(self, archive, msgid):
+        self.__openIndices(archive)
+        if self.__cachedict.has_key(msgid):
+            # Move the hit to the most-recently-used end of the cache
+            self.__cachekeys.remove(msgid)
+            self.__cachekeys.append(msgid)
+            return self.__cachedict[msgid]
+        if len(self.__cachekeys) == CACHESIZE:
+            # Evict the least recently used article
+            delkey, self.__cachekeys = (self.__cachekeys[0],
+                                        self.__cachekeys[1:])
+            del self.__cachedict[delkey]
+        s = self.articleIndex[msgid]
+        article = pickle.loads(s)
+        self.__cachekeys.append(msgid)
+        self.__cachedict[msgid] = article
+        return article
+
+    def first(self, archive, index):
+        self.__openIndices(archive)
+        index = getattr(self, index + 'Index')
+        try:
+            key, msgid = index.first()
+            return msgid
+        except KeyError:
+            return None
+
+    def next(self, archive, index):
+        self.__openIndices(archive)
+        index = getattr(self, index + 'Index')
+        try:
+            key, msgid = index.next()
+        except KeyError:
+            return None
+        else:
+            return msgid
+
+    def getOldestArticle(self, archive, subject):
+        self.__openIndices(archive)
+        subject = subject.lower()
+        try:
+            key, tempid = self.subjectIndex.set_location(subject)
+            self.subjectIndex.next()
+            [subject2, date] = key.split('\0')
+            if subject != subject2:
+                return None
+            return tempid
+        except KeyError:
+            # set_location() raises KeyError when nothing sorts at or
+            # after subject; next() raises it at the end of the index
+            return None
+
+    def newArchive(self, archive):
+        pass
+
+    def clearIndex(self, archive, index):
+        self.__openIndices(archive)
+        index = getattr(self, index + 'Index')
+        finished = 0
+        try:
+            key, msgid = index.first()
+        except KeyError:
+            finished = 1
+        while not finished:
+            del index[key]
+            try:
+                key, msgid = index.next()
+            except KeyError:
+                finished = 1
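The first()/next() pairs above all follow the same bsddb cursor idiom, with
KeyError as the end-of-records signal. A standalone sketch of that idiom,
assuming a throwaway scratch path (not part of the diff):

    import bsddb

    db = bsddb.btopen('/tmp/pipermail-demo.db', 'c')   # assumed scratch file
    db['01000000000-<a@example.com>'] = '<a@example.com>'
    db['01000000000-01000000100-<b@example.com>'] = '<b@example.com>'
    try:
        key, msgid = db.first()        # position the cursor at the lowest key
        while 1:
            print key, '->', msgid
            key, msgid = db.next()     # advance the cursor
    except KeyError:                   # raised when no records remain
        pass
    db.close()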
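Putting it together, a hypothetical driver for a concrete T subclass
(HyperArchive in HyperArch.py is the real one). Every "my"/"Some"/"the_"
name and path here is an assumption for illustration; note in particular
that processUnixMailbox expects a maillist attribute, since ArchiverMailbox
is constructed with the mailing list:

    basedir = '/var/mailman/archives/private/mylist'   # assumed layout
    f = open('/var/mailman/archives/private/mylist.mbox/mylist.mbox')
    archiver = SomeConcreteT(basedir=basedir,
                             database=BSDDBdatabase(basedir))
    archiver.maillist = the_mlist      # ArchiverMailbox needs the MailList
    archiver.processUnixMailbox(f)     # parse, thread, and write each article
    f.close()
    archiver.close()                   # update indices and pickle the state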