diff options
Diffstat (limited to '')
-rw-r--r-- | Mailman/Archiver/HyperArch.py | 1224 |
1 files changed, 1224 insertions, 0 deletions
diff --git a/Mailman/Archiver/HyperArch.py b/Mailman/Archiver/HyperArch.py new file mode 100644 index 00000000..98fb5738 --- /dev/null +++ b/Mailman/Archiver/HyperArch.py @@ -0,0 +1,1224 @@ +# Copyright (C) 1998,1999,2000,2001,2002 by the Free Software Foundation, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +"""HyperArch: Pipermail archiving for Mailman + + - The Dragon De Monsyne <dragondm@integral.org> + + TODO: + - Should be able to force all HTML to be regenerated next time the + archive is run, in case a template is changed. + - Run a command to generate tarball of html archives for downloading + (probably in the 'update_dirty_archives' method). +""" + +from __future__ import nested_scopes + +import sys +import re +import errno +import urllib +import time +import os +import types +import HyperDatabase +import pipermail +import weakref +import binascii + +from email.Header import decode_header, make_header + +from Mailman import mm_cfg +from Mailman import Utils +from Mailman import LockFile +from Mailman import MailList +from Mailman import i18n +from Mailman.SafeDict import SafeDict +from Mailman.Logging.Syslog import syslog +from Mailman.Mailbox import ArchiverMailbox + +# Set up i18n. Assume the current language has already been set in the caller. 
+_ = i18n._ + +gzip = None +if mm_cfg.GZIP_ARCHIVE_TXT_FILES: + try: + import gzip + except ImportError: + pass + +EMPTYSTRING = '' +NL = '\n' + +# MacOSX has a default stack size that is too small for deeply recursive +# regular expressions. We see this as crashes in the Python test suite when +# running test_re.py and test_sre.py. The fix is to set the stack limit to +# 2048; the general recommendation is to do in the shell before running the +# test suite. But that's inconvenient for a daemon like the qrunner. +# +# AFAIK, this problem only affects the archiver, so we're adding this work +# around to this file (it'll get imported by the bundled pipermail or by the +# bin/arch script. We also only do this on darwin, a.k.a. MacOSX. +if sys.platform == 'darwin': + try: + import resource + except ImportError: + pass + else: + soft, hard = resource.getrlimit(resource.RLIMIT_STACK) + newsoft = min(hard, max(soft, 1024*2048)) + resource.setrlimit(resource.RLIMIT_STACK, (newsoft, hard)) + + + +def html_quote(s, lang=None): + repls = ( ('&', '&'), + ("<", '<'), + (">", '>'), + ('"', '"')) + for thing, repl in repls: + s = s.replace(thing, repl) + return Utils.uncanonstr(s, lang) + + +def url_quote(s): + return urllib.quote(s) + + +def null_to_space(s): + return s.replace('\000', ' ') + + +def sizeof(filename, lang): + try: + size = os.path.getsize(filename) + except OSError, e: + # ENOENT can happen if the .mbox file was moved away or deleted, and + # an explicit mbox file name was given to bin/arch. + if e.errno <> errno.ENOENT: raise + return _('size not available') + if size < 1000: + # Avoid i18n side-effects + otrans = i18n.get_translation() + try: + i18n.set_language(lang) + out = _(' %(size)i bytes ') + finally: + i18n.set_translation(otrans) + return out + elif size < 1000000: + return ' %d KB ' % (size / 1000) + # GB?? 
:-) + return ' %d MB ' % (size / 1000000) + + +html_charset = '<META http-equiv="Content-Type" ' \ + 'content="text/html; charset=%s">' + +def CGIescape(arg, lang=None): + if isinstance(arg, types.UnicodeType): + s = Utils.websafe(arg) + else: + s = Utils.websafe(str(arg)) + return Utils.uncanonstr(s.replace('"', '"'), lang) + +# Parenthesized human name +paren_name_pat = re.compile(r'([(].*[)])') + +# Subject lines preceded with 'Re:' +REpat = re.compile( r"\s*RE\s*(\[\d+\]\s*)?:\s*", re.IGNORECASE) + +# E-mail addresses and URLs in text +emailpat = re.compile(r'([-+,.\w]+@[-+.\w]+)') + +# Argh! This pattern is buggy, and will choke on URLs with GET parameters. +urlpat = re.compile(r'(\w+://[^>)\s]+)') # URLs in text + +# Blank lines +blankpat = re.compile(r'^\s*$') + +# Starting <html> directive +htmlpat = re.compile(r'^\s*<HTML>\s*$', re.IGNORECASE) +# Ending </html> directive +nohtmlpat = re.compile(r'^\s*</HTML>\s*$', re.IGNORECASE) +# Match quoted text +quotedpat = re.compile(r'^([>|:]|>)+') + + + +# This doesn't need to be a weakref instance because it's just storing +# strings. Keys are (templatefile, lang) tuples. 
_templatecache = {}

def quick_maketext(templatefile, dict=None, lang=None, mlist=None):
    """Return `templatefile` rendered with the values in `dict`.

    The raw template is fetched through Utils.maketext() once per
    (templatefile, lang) pair and kept in _templatecache.  When `lang` is
    not given it is taken from `mlist`, falling back to the site default.
    Interpolation problems (TypeError/ValueError) leave the raw template
    text in place.  The result always goes through Utils.uncanonstr().
    """
    if lang is None:
        if mlist is not None:
            lang = mlist.preferred_language
        else:
            lang = mm_cfg.DEFAULT_SERVER_LANGUAGE
    cachekey = (templatefile, lang)
    template = _templatecache.get(cachekey)
    if template is None:
        # Use the basic maketext, with defaults to get the raw template
        template = Utils.maketext(templatefile, lang=lang, raw=1)
        _templatecache[cachekey] = template
    if dict is None:
        # Nothing to interpolate
        return Utils.uncanonstr(template, lang)
    # Copied from Utils.maketext()
    text = template
    try:
        sdict = SafeDict(dict)
        try:
            text = sdict.interpolate(template)
        except UnicodeError:
            # Try again after coercing the template to unicode
            utemplate = unicode(template, Utils.GetCharSet(lang), 'replace')
            text = sdict.interpolate(utemplate)
    except (TypeError, ValueError):
        # The template is really screwed up
        pass
    # Make sure the text is in the given character set, or html-ify any bogus
    # characters.
    return Utils.uncanonstr(text, lang)



# Note: I'm overriding most, if not all of the pipermail Article class
# here -ddm
# The Article class encapsulates a single posting.
The attributes are: +# +# sequence : Sequence number, unique for each article in a set of archives +# subject : Subject +# datestr : The posting date, in human-readable format +# date : The posting date, in purely numeric format +# fromdate : The posting date, in `unixfrom' format +# headers : Any other headers of interest +# author : The author's name (and possibly organization) +# email : The author's e-mail address +# msgid : A unique message ID +# in_reply_to : If !="", this is the msgid of the article being replied to +# references: A (possibly empty) list of msgid's of earlier articles in +# the thread +# body : A list of strings making up the message body + +class Article(pipermail.Article): + __super_init = pipermail.Article.__init__ + __super_set_date = pipermail.Article._set_date + + _last_article_time = time.time() + + def __init__(self, message=None, sequence=0, keepHeaders=[], + lang=mm_cfg.DEFAULT_SERVER_LANGUAGE, mlist=None): + self.__super_init(message, sequence, keepHeaders) + self.prev = None + self.next = None + # Trim Re: from the subject line + i = 0 + while i != -1: + result = REpat.match(self.subject) + if result: + i = result.end(0) + self.subject = self.subject[i:] + else: + i = -1 + # Useful to keep around + self._lang = lang + self._mlist = mlist + + if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: + # Avoid i18n side-effects. Note that the language for this + # article (for this list) could be different from the site-wide + # preferred language, so we need to ensure no side-effects will + # occur. Think what happens when executing bin/arch. + otrans = i18n.get_translation() + try: + i18n.set_language(lang) + self.email = re.sub('@', _(' at '), self.email) + finally: + i18n.set_translation(otrans) + + # Snag the content-* headers. RFC 1521 states that their values are + # case insensitive. 
+ ctype = message.get('Content-Type', 'text/plain') + cenc = message.get('Content-Transfer-Encoding', '') + self.ctype = ctype.lower() + self.cenc = cenc.lower() + self.decoded = {} + charset = message.get_param('charset') + if charset: + charset = charset.lower().strip() + if charset[0]=='"' and charset[-1]=='"': + charset = charset[1:-1] + if charset[0]=="'" and charset[-1]=="'": + charset = charset[1:-1] + try: + body = message.get_payload(decode=1) + except binascii.Error: + body = None + if body and charset != Utils.GetCharSet(self._lang): + # decode body + try: + body = unicode(body, charset) + except (UnicodeError, LookupError): + body = None + if body: + self.body = [l + "\n" for l in body.splitlines()] + + self.decode_headers() + + # Mapping of listnames to MailList instances as a weak value dictionary. + # This code is copied from Runner.py but there's one important operational + # difference. In Runner.py, we always .Load() the MailList object for + # each _dispose() run, otherwise the object retrieved from the cache won't + # be up-to-date. Since we're creating a new HyperArchive instance for + # each message being archived, we don't need to worry about that -- but it + # does mean there are additional opportunities for optimization. + _listcache = weakref.WeakValueDictionary() + + def _open_list(self, listname): + # Cache the open list so that any use of the list within this process + # uses the same object. We use a WeakValueDictionary so that when the + # list is no longer necessary, its memory is freed. + mlist = self._listcache.get(listname) + if not mlist: + try: + mlist = MailList.MailList(listname, lock=0) + except Errors.MMListError, e: + syslog('error', 'error opening list: %s\n%s', listname, e) + return None + else: + self._listcache[listname] = mlist + return mlist + + def __getstate__(self): + d = self.__dict__.copy() + # We definitely don't want to pickle the MailList instance, so just + # pickle a reference to it. 
+ if d.has_key('_mlist'): + mlist = d['_mlist'] + del d['_mlist'] + else: + mlist = None + if mlist: + d['__listname'] = self._mlist.internal_name() + else: + d['__listname'] = None + # Delete a few other things we don't want in the pickle + for attr in ('prev', 'next', 'body'): + if d.has_key(attr): + del d[attr] + d['body'] = [] + return d + + def __setstate__(self, d): + # For loading older Articles via pickle. All this stuff was added + # when Simone Piunni and Tokio Kikuchi i18n'ified Pipermail. See SF + # patch #594771. + self.__dict__ = d + listname = d.get('__listname') + if listname: + del d['__listname'] + d['_mlist'] = self._open_list(listname) + if not d.has_key('_lang'): + if hasattr(self, '_mlist'): + self._lang = self._mlist.preferred_language + else: + self._lang = mm_cfg.DEFAULT_SERVER_LANGUAGE + if not d.has_key('cenc'): + self.cenc = None + if not d.has_key('decoded'): + self.decoded = {} + + def setListIfUnset(self, mlist): + if getattr(self, '_mlist', None) is None: + self._mlist = mlist + + def quote(self, buf): + return html_quote(buf, self._lang) + + def decode_headers(self): + """MIME-decode headers. + + If the email, subject, or author attributes contain non-ASCII + characters using the encoded-word syntax of RFC 2047, decoded versions + of those attributes are placed in the self.decoded (a dictionary). + + If the list's charset differs from the header charset, an attempt is + made to decode the headers as Unicode. If that fails, they are left + undecoded. 
+ """ + author = self.decode_charset(self.author) + subject = self.decode_charset(self.subject) + if author: + self.decoded['author'] = author + email = self.decode_charset(self.email) + if email: + self.decoded['email'] = email + if subject: + self.decoded['subject'] = subject + + def decode_charset(self, field): + if field.find("=?") == -1: + return None + # Get the decoded header as a list of (s, charset) tuples + pairs = decode_header(field) + # Use __unicode__() until we can guarantee Python 2.2 + try: + # Use a large number for maxlinelen so it won't get wrapped + h = make_header(pairs, 99999) + return h.__unicode__() + except (UnicodeError, LookupError): + # Unknown encoding + return None + # The last value for c will have the proper charset in it + return EMPTYSTRING.join([s for s, c in pairs]) + + def as_html(self): + d = self.__dict__.copy() + # avoid i18n side-effects + otrans = i18n.get_translation() + i18n.set_language(self._lang) + try: + d["prev"], d["prev_wsubj"] = self._get_prev() + d["next"], d["next_wsubj"] = self._get_next() + + d["email_html"] = self.quote(self.email) + d["title"] = self.quote(self.subject) + d["subject_html"] = self.quote(self.subject) + d["subject_url"] = url_quote(self.subject) + d["in_reply_to_url"] = url_quote(self.in_reply_to) + if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: + # Point the mailto url back to the list + author = re.sub('@', _(' at '), self.author) + emailurl = self._mlist.GetListEmail() + else: + author = self.author + emailurl = self.email + d["author_html"] = self.quote(author) + d["email_url"] = url_quote(emailurl) + d["datestr_html"] = self.quote(i18n.ctime(int(self.date))) + d["body"] = self._get_body() + d['listurl'] = self._mlist.GetScriptURL('listinfo', absolute=1) + d['listname'] = self._mlist.real_name + d['encoding'] = '' + finally: + i18n.set_translation(otrans) + + charset = Utils.GetCharSet(self._lang) + d["encoding"] = html_charset % charset + + self._add_decoded(d) + return quick_maketext( + 
'article.html', d, + lang=self._lang, mlist=self._mlist) + + def _get_prev(self): + """Return the href and subject for the previous message""" + if self.prev: + subject = self._get_subject_enc(self.prev) + prev = ('<LINK REL="Previous" HREF="%s">' + % (url_quote(self.prev.filename))) + prev_wsubj = ('<LI>' + _('Previous message:') + + ' <A HREF="%s">%s\n</A></li>' + % (url_quote(self.prev.filename), + self.quote(subject))) + else: + prev = prev_wsubj = "" + return prev, prev_wsubj + + def _get_subject_enc(self, art): + """Return the subject of art, decoded if possible. + + If the charset of the current message and art match and the + article's subject is encoded, decode it. + """ + return art.decoded.get('subject', art.subject) + + def _get_next(self): + """Return the href and subject for the previous message""" + if self.next: + subject = self._get_subject_enc(self.next) + next = ('<LINK REL="Next" HREF="%s">' + % (url_quote(self.next.filename))) + next_wsubj = ('<LI>' + _('Next message:') + + ' <A HREF="%s">%s\n</A></li>' + % (url_quote(self.next.filename), + self.quote(subject))) + else: + next = next_wsubj = "" + return next, next_wsubj + + _rx_quote = re.compile('=([A-F0-9][A-F0-9])') + _rx_softline = re.compile('=[ \t]*$') + + def _get_body(self): + """Return the message body ready for HTML, decoded if necessary""" + try: + body = self.html_body + except AttributeError: + body = self.body + return null_to_space(EMPTYSTRING.join(body)) + + def _add_decoded(self, d): + """Add encoded-word keys to HTML output""" + for src, dst in (('author', 'author_html'), + ('email', 'email_html'), + ('subject', 'subject_html'), + ('subject', 'title')): + if self.decoded.has_key(src): + d[dst] = self.quote(self.decoded[src]) + + def as_text(self): + d = self.__dict__.copy() + # We need to guarantee a valid From_ line, even if there are + # bososities in the headers. 
+ if not d.get('fromdate', '').strip(): + d['fromdate'] = time.ctime(time.time()) + if not d.get('email', '').strip(): + d['email'] = 'bogus@does.not.exist.com' + if not d.get('datestr', '').strip(): + d['datestr'] = time.ctime(time.time()) + # + headers = ['From %(email)s %(fromdate)s', + 'From: %(email)s (%(author)s)', + 'Date: %(datestr)s', + 'Subject: %(subject)s'] + if d['_in_reply_to']: + headers.append('In-Reply-To: %(_in_reply_to)s') + if d['_references']: + headers.append('References: %(_references)s') + if d['_message_id']: + headers.append('Message-ID: %(_message_id)s') + body = EMPTYSTRING.join(self.body) + if isinstance(body, types.UnicodeType): + body = body.encode(Utils.GetCharSet(self._lang), 'replace') + return NL.join(headers) % d + '\n\n' + body + + def _set_date(self, message): + self.__super_set_date(message) + self.fromdate = time.ctime(int(self.date)) + + def loadbody_fromHTML(self,fileobj): + self.body = [] + begin = 0 + while 1: + line = fileobj.readline() + if not line: + break + if not begin: + if line.strip() == '<!--beginarticle-->': + begin = 1 + continue + if line.strip() == '<!--endarticle-->': + break + self.body.append(line) + + + +class HyperArchive(pipermail.T): + __super_init = pipermail.T.__init__ + __super_update_archive = pipermail.T.update_archive + __super_update_dirty_archives = pipermail.T.update_dirty_archives + __super_add_article = pipermail.T.add_article + + # some defaults + DIRMODE = 02775 + FILEMODE = 0660 + + VERBOSE = 0 + DEFAULTINDEX = 'thread' + ARCHIVE_PERIOD = 'month' + + THREADLAZY = 0 + THREADLEVELS = 3 + + ALLOWHTML = 1 # "Lines between <html></html>" handled as is. + SHOWHTML = 0 # Eg, nuke leading whitespace in html manner. + IQUOTES = 1 # Italicize quoted text. + SHOWBR = 0 # Add <br> onto every line + + def __init__(self, maillist): + # can't init the database while other processes are writing to it! 
+ # XXX TODO- implement native locking + # with mailman's LockFile module for HyperDatabase.HyperDatabase + # + dir = maillist.archive_dir() + db = HyperDatabase.HyperDatabase(dir, maillist) + self.__super_init(dir, reload=1, database=db) + + self.maillist = maillist + self._lock_file = None + self.lang = maillist.preferred_language + self.charset = Utils.GetCharSet(maillist.preferred_language) + + if hasattr(self.maillist,'archive_volume_frequency'): + if self.maillist.archive_volume_frequency == 0: + self.ARCHIVE_PERIOD='year' + elif self.maillist.archive_volume_frequency == 2: + self.ARCHIVE_PERIOD='quarter' + elif self.maillist.archive_volume_frequency == 3: + self.ARCHIVE_PERIOD='week' + elif self.maillist.archive_volume_frequency == 4: + self.ARCHIVE_PERIOD='day' + else: + self.ARCHIVE_PERIOD='month' + + yre = r'(?P<year>[0-9]{4,4})' + mre = r'(?P<month>[01][0-9])' + dre = r'(?P<day>[0123][0-9])' + self._volre = { + 'year': '^' + yre + '$', + 'quarter': '^' + yre + r'q(?P<quarter>[1234])$', + 'month': '^' + yre + r'-(?P<month>[a-zA-Z]+)$', + 'week': r'^Week-of-Mon-' + yre + mre + dre, + 'day': '^' + yre + mre + dre + '$' + } + + def _makeArticle(self, msg, sequence): + return Article(msg, sequence, + lang=self.maillist.preferred_language, + mlist=self.maillist) + + def html_foot(self): + # avoid i18n side-effects + mlist = self.maillist + otrans = i18n.get_translation() + i18n.set_language(mlist.preferred_language) + # Convenience + def quotetime(s): + return html_quote(i18n.ctime(s), self.lang) + try: + d = {"lastdate": quotetime(self.lastdate), + "archivedate": quotetime(self.archivedate), + "listinfo": mlist.GetScriptURL('listinfo', absolute=1), + "version": self.version, + } + i = {"thread": _("thread"), + "subject": _("subject"), + "author": _("author"), + "date": _("date") + } + finally: + i18n.set_translation(otrans) + + for t in i.keys(): + cap = t[0].upper() + t[1:] + if self.type == cap: + d["%s_ref" % (t)] = "" + else: + d["%s_ref" % (t)] = ('<a 
href="%s.html#start">[ %s ]</a>' + % (t, i[t])) + return quick_maketext( + 'archidxfoot.html', d, + mlist=mlist) + + def html_head(self): + # avoid i18n side-effects + mlist = self.maillist + otrans = i18n.get_translation() + i18n.set_language(mlist.preferred_language) + # Convenience + def quotetime(s): + return html_quote(i18n.ctime(s), self.lang) + try: + d = {"listname": html_quote(mlist.real_name, self.lang), + "archtype": self.type, + "archive": self.volNameToDesc(self.archive), + "listinfo": mlist.GetScriptURL('listinfo', absolute=1), + "firstdate": quotetime(self.firstdate), + "lastdate": quotetime(self.lastdate), + "size": self.size, + } + i = {"thread": _("thread"), + "subject": _("subject"), + "author": _("author"), + "date": _("date"), + } + finally: + i18n.set_translation(otrans) + + for t in i.keys(): + cap = t[0].upper() + t[1:] + if self.type == cap: + d["%s_ref" % (t)] = "" + d["archtype"] = i[t] + else: + d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>' + % (t, i[t])) + if self.charset: + d["encoding"] = html_charset % self.charset + else: + d["encoding"] = "" + return quick_maketext( + 'archidxhead.html', d, + mlist=mlist) + + def html_TOC(self): + mlist = self.maillist + listname = mlist.internal_name() + mbox = os.path.join(mlist.archive_dir()+'.mbox', listname+'.mbox') + d = {"listname": mlist.real_name, + "listinfo": mlist.GetScriptURL('listinfo', absolute=1), + "fullarch": '../%s.mbox/%s.mbox' % (listname, listname), + "size": sizeof(mbox, mlist.preferred_language), + 'meta': '', + } + # Avoid i18n side-effects + otrans = i18n.get_translation() + i18n.set_language(mlist.preferred_language) + try: + if not self.archives: + d["noarchive_msg"] = _( + '<P>Currently, there are no archives. 
</P>') + d["archive_listing_start"] = "" + d["archive_listing_end"] = "" + d["archive_listing"] = "" + else: + d["noarchive_msg"] = "" + d["archive_listing_start"] = quick_maketext( + 'archliststart.html', + lang=mlist.preferred_language, + mlist=mlist) + d["archive_listing_end"] = quick_maketext( + 'archlistend.html', + mlist=mlist) + + accum = [] + for a in self.archives: + accum.append(self.html_TOC_entry(a)) + d["archive_listing"] = EMPTYSTRING.join(accum) + finally: + i18n.set_translation(otrans) + + # The TOC is always in the charset of the list's preferred language + d['meta'] += html_charset % Utils.GetCharSet(mlist.preferred_language) + + return quick_maketext( + 'archtoc.html', d, + mlist=mlist) + + def html_TOC_entry(self, arch): + # Check to see if the archive is gzip'd or not + txtfile = os.path.join(self.maillist.archive_dir(), arch + '.txt') + gzfile = txtfile + '.gz' + # which exists? .txt.gz first, then .txt + if os.path.exists(gzfile): + file = gzfile + url = arch + '.txt.gz' + templ = '<td><A href="%(url)s">[ ' + _('Gzip\'d Text%(sz)s') \ + + ']</a></td>' + elif os.path.exists(txtfile): + file = txtfile + url = arch + '.txt' + templ = '<td><A href="%(url)s">[ ' + _('Text%(sz)s') + ']</a></td>' + else: + # neither found? + file = None + # in Python 1.5.2 we have an easy way to get the size + if file: + textlink = templ % { + 'url': url, + 'sz' : sizeof(file, self.maillist.preferred_language) + } + else: + # there's no archive file at all... hmmm. 
+ textlink = '' + return quick_maketext( + 'archtocentry.html', + {'archive': arch, + 'archivelabel': self.volNameToDesc(arch), + 'textlink': textlink + }, + mlist=self.maillist) + + def GetArchLock(self): + if self._lock_file: + return 1 + self._lock_file = LockFile.LockFile( + os.path.join(mm_cfg.LOCK_DIR, + self.maillist.internal_name() + '-arch.lock')) + try: + self._lock_file.lock(timeout=0.5) + except LockFile.TimeOutError: + return 0 + return 1 + + def DropArchLock(self): + if self._lock_file: + self._lock_file.unlock(unconditionally=1) + self._lock_file = None + + def processListArch(self): + name = self.maillist.ArchiveFileName() + wname= name+'.working' + ename= name+'.err_unarchived' + try: + os.stat(name) + except (IOError,os.error): + #no archive file, nothin to do -ddm + return + + #see if arch is locked here -ddm + if not self.GetArchLock(): + #another archiver is running, nothing to do. -ddm + return + + #if the working file is still here, the archiver may have + # crashed during archiving. Save it, log an error, and move on. + try: + wf = open(wname) + syslog('error', + 'Archive working file %s present. ' + 'Check %s for possibly unarchived msgs', + wname, ename) + omask = os.umask(007) + try: + ef = open(ename, 'a+') + finally: + os.umask(omask) + ef.seek(1,2) + if ef.read(1) <> '\n': + ef.write('\n') + ef.write(wf.read()) + ef.close() + wf.close() + os.unlink(wname) + except IOError: + pass + os.rename(name,wname) + archfile = open(wname) + self.processUnixMailbox(archfile) + archfile.close() + os.unlink(wname) + self.DropArchLock() + + def get_filename(self, article): + return '%06i.html' % (article.sequence,) + + def get_archives(self, article): + """Return a list of indexes where the article should be filed. 
+ A string can be returned if the list only contains one entry, + and the empty list is legal.""" + res = self.dateToVolName(float(article.date)) + self.message(_("figuring article archives\n")) + self.message(res + "\n") + return res + + def volNameToDesc(self, volname): + volname = volname.strip() + # Don't make these module global constants since we have to runtime + # translate them anyway. + monthdict = [ + '', + _('January'), _('February'), _('March'), _('April'), + _('May'), _('June'), _('July'), _('August'), + _('September'), _('October'), _('November'), _('December') + ] + for each in self._volre.keys(): + match = re.match(self._volre[each], volname) + # Let ValueErrors percolate up + if match: + year = int(match.group('year')) + if each == 'quarter': + d =["", _("First"), _("Second"), _("Third"), _("Fourth") ] + ord = d[int(match.group('quarter'))] + return _("%(ord)s quarter %(year)i") + elif each == 'month': + monthstr = match.group('month').lower() + for i in range(1, 13): + monthname = time.strftime("%B", (1999,i,1,0,0,0,0,1,0)) + if monthstr.lower() == monthname.lower(): + month = monthdict[i] + return _("%(month)s %(year)i") + raise ValueError, "%s is not a month!" % monthstr + elif each == 'week': + month = monthdict[int(match.group("month"))] + day = int(match.group("day")) + return _("The Week Of Monday %(day)i %(month)s %(year)i") + elif each == 'day': + month = monthdict[int(match.group("month"))] + day = int(match.group("day")) + return _("%(day)i %(month)s %(year)i") + else: + return match.group('year') + raise ValueError, "%s is not a valid volname" % volname + +# The following two methods should be inverses of each other. 
-ddm + + def dateToVolName(self,date): + datetuple=time.localtime(date) + if self.ARCHIVE_PERIOD=='year': + return time.strftime("%Y",datetuple) + elif self.ARCHIVE_PERIOD=='quarter': + if datetuple[1] in [1,2,3]: + return time.strftime("%Yq1",datetuple) + elif datetuple[1] in [4,5,6]: + return time.strftime("%Yq2",datetuple) + elif datetuple[1] in [7,8,9]: + return time.strftime("%Yq3",datetuple) + else: + return time.strftime("%Yq4",datetuple) + elif self.ARCHIVE_PERIOD == 'day': + return time.strftime("%Y%m%d", datetuple) + elif self.ARCHIVE_PERIOD == 'week': + # Reconstruct "seconds since epoch", and subtract weekday + # multiplied by the number of seconds in a day. + monday = time.mktime(datetuple) - datetuple[6] * 24 * 60 * 60 + # Build a new datetuple from this "seconds since epoch" value + datetuple = time.localtime(monday) + return time.strftime("Week-of-Mon-%Y%m%d", datetuple) + # month. -ddm + else: + return time.strftime("%Y-%B",datetuple) + + + def volNameToDate(self,volname): + volname = volname.strip() + for each in self._volre.keys(): + match=re.match(self._volre[each],volname) + if match: + year=int(match.group('year')) + month=1 + day = 1 + if each == 'quarter': + q=int(match.group('quarter')) + month=(q*3)-2 + elif each == 'month': + monthstr=match.group('month').lower() + m=[] + for i in range(1,13): + m.append( + time.strftime("%B",(1999,i,1,0,0,0,0,1,0)).lower()) + try: + month=m.index(monthstr)+1 + except ValueError: + pass + elif each == 'week' or each == 'day': + month = int(match.group("month")) + day = int(match.group("day")) + return time.mktime((year,month,1,0,0,0,0,1,-1)) + return 0.0 + + def sortarchives(self): + def sf(a,b,s=self): + al=s.volNameToDate(a) + bl=s.volNameToDate(b) + if al>bl: + return 1 + elif al<bl: + return -1 + else: + return 0 + if self.ARCHIVE_PERIOD in ('month','year','quarter'): + self.archives.sort(sf) + else: + self.archives.sort() + self.archives.reverse() + + def message(self, msg): + if self.VERBOSE: + f = 
sys.stderr + f.write(msg) + if msg[-1:] != '\n': + f.write('\n') + f.flush() + + def open_new_archive(self, archive, archivedir): + index_html = os.path.join(archivedir, 'index.html') + try: + os.unlink(index_html) + except: + pass + os.symlink(self.DEFAULTINDEX+'.html',index_html) + + def write_index_header(self): + self.depth=0 + print self.html_head() + if not self.THREADLAZY and self.type=='Thread': + self.message(_("Computing threaded index\n")) + self.updateThreadedIndex() + + def write_index_footer(self): + for i in range(self.depth): + print '</UL>' + print self.html_foot() + + def write_index_entry(self, article): + subject = self.get_header("subject", article) + author = self.get_header("author", article) + if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: + author = re.sub('@', _(' at '), author) + subject = CGIescape(subject, self.lang) + author = CGIescape(author, self.lang) + + d = { + 'filename': urllib.quote(article.filename), + 'subject': subject, + 'sequence': article.sequence, + 'author': author + } + print quick_maketext( + 'archidxentry.html', d, + mlist=self.maillist) + + def get_header(self, field, article): + # if we have no decoded header, return the encoded one + result = article.decoded.get(field) + if result is None: + return getattr(article, field) + # otherwise, the decoded one will be Unicode + return result + + def write_threadindex_entry(self, article, depth): + if depth < 0: + self.message('depth<0') + depth = 0 + if depth > self.THREADLEVELS: + depth = self.THREADLEVELS + if depth < self.depth: + for i in range(self.depth-depth): + print '</UL>' + elif depth > self.depth: + for i in range(depth-self.depth): + print '<UL>' + print '<!--%i %s -->' % (depth, article.threadKey) + self.depth = depth + self.write_index_entry(article) + + def write_TOC(self): + self.sortarchives() + omask = os.umask(002) + try: + toc = open(os.path.join(self.basedir, 'index.html'), 'w') + finally: + os.umask(omask) + toc.write(self.html_TOC()) + toc.close() + + 
def write_article(self, index, article, path): + # called by add_article + omask = os.umask(002) + try: + f = open(path, 'w') + finally: + os.umask(omask) + f.write(article.as_html()) + f.close() + + # Write the text article to the text archive. + path = os.path.join(self.basedir, "%s.txt" % index) + omask = os.umask(002) + try: + f = open(path, 'a+') + finally: + os.umask(omask) + f.write(article.as_text()) + f.close() + + def update_archive(self, archive): + self.__super_update_archive(archive) + # only do this if the gzip module was imported globally, and + # gzip'ing was enabled via mm_cfg.GZIP_ARCHIVE_TXT_FILES. See + # above. + if gzip: + archz = None + archt = None + txtfile = os.path.join(self.basedir, '%s.txt' % archive) + gzipfile = os.path.join(self.basedir, '%s.txt.gz' % archive) + oldgzip = os.path.join(self.basedir, '%s.old.txt.gz' % archive) + try: + # open the plain text file + archt = open(txtfile) + except IOError: + return + try: + os.rename(gzipfile, oldgzip) + archz = gzip.open(oldgzip) + except (IOError, RuntimeError, os.error): + pass + try: + ou = os.umask(002) + newz = gzip.open(gzipfile, 'w') + finally: + # XXX why is this a finally? + os.umask(ou) + if archz: + newz.write(archz.read()) + archz.close() + os.unlink(oldgzip) + # XXX do we really need all this in a try/except? + try: + newz.write(archt.read()) + newz.close() + archt.close() + except IOError: + pass + os.unlink(txtfile) + + _skip_attrs = ('maillist', '_lock_file', 'charset') + + def getstate(self): + d={} + for each in self.__dict__.keys(): + if not (each in self._skip_attrs + or each.upper() == each): + d[each] = self.__dict__[each] + return d + + # Add <A HREF="..."> tags around URLs and e-mail addresses. + + def __processbody_URLquote(self, lines): + # XXX a lot to do here: + # 1. use lines directly, rather than source and dest + # 2. make it clearer + # 3. 
make it faster + source = lines[:] + dest = lines + last_line_was_quoted = 0 + for i in xrange(0, len(source)): + Lorig = L = source[i] + prefix = suffix = "" + if L is None: + continue + # Italicise quoted text + if self.IQUOTES: + quoted = quotedpat.match(L) + if quoted is None: + last_line_was_quoted = 0 + else: + quoted = quoted.end(0) + prefix = CGIescape(L[:quoted], self.lang) + '<i>' + suffix = '</I>' + if self.SHOWHTML: + suffix += '<BR>' + if not last_line_was_quoted: + prefix = '<BR>' + prefix + L = L[quoted:] + last_line_was_quoted = 1 + # Check for an e-mail address + L2 = "" + jr = emailpat.search(L) + kr = urlpat.search(L) + while jr is not None or kr is not None: + if jr == None: + j = -1 + else: + j = jr.start(0) + if kr is None: + k = -1 + else: + k = kr.start(0) + if j != -1 and (j < k or k == -1): + text = jr.group(1) + length = len(text) + if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS: + text = re.sub('@', _(' at '), text) + URL = self.maillist.GetScriptURL( + 'listinfo', absolute=1) + else: + URL = 'mailto:' + text + pos = j + elif k != -1 and (j > k or j == -1): + text = URL = kr.group(1) + length = len(text) + pos = k + else: # j==k + raise ValueError, "j==k: This can't happen!" + #length = len(text) + #self.message("URL: %s %s %s \n" + # % (CGIescape(L[:pos]), URL, CGIescape(text))) + L2 += '%s<A HREF="%s">%s</A>' % ( + CGIescape(L[:pos], self.lang), + html_quote(URL), CGIescape(text, self.lang)) + L = L[pos+length:] + jr = emailpat.search(L) + kr = urlpat.search(L) + if jr is None and kr is None: + L = CGIescape(L, self.lang) + L = prefix + L2 + L + suffix + source[i] = None + dest[i] = L + + # Perform Hypermail-style processing of <HTML></HTML> directives + # in message bodies. Lines between <HTML> and </HTML> will be written + # out precisely as they are; other lines will be passed to func2 + # for further processing . 
+ + def __processbody_HTML(self, lines): + # XXX need to make this method modify in place + source = lines[:] + dest = lines + l = len(source) + i = 0 + while i < l: + while i < l and htmlpat.match(source[i]) is None: + i = i + 1 + if i < l: + source[i] = None + i = i + 1 + while i < l and nohtmlpat.match(source[i]) is None: + dest[i], source[i] = source[i], None + i = i + 1 + if i < l: + source[i] = None + i = i + 1 + + def format_article(self, article): + # called from add_article + # TBD: Why do the HTML formatting here and keep it in the + # pipermail database? It makes more sense to do the html + # formatting as the article is being written as html and toss + # the data after it has been written to the archive file. + lines = filter(None, article.body) + # Handle <HTML> </HTML> directives + if self.ALLOWHTML: + self.__processbody_HTML(lines) + self.__processbody_URLquote(lines) + if not self.SHOWHTML and lines: + lines.insert(0, '<PRE>') + lines.append('</PRE>') + else: + # Do fancy formatting here + if self.SHOWBR: + lines = map(lambda x:x + "<BR>", lines) + else: + for i in range(0, len(lines)): + s = lines[i] + if s[0:1] in ' \t\n': + lines[i] = '<P>' + s + article.html_body = lines + return article + + def update_article(self, arcdir, article, prev, next): + seq = article.sequence + filename = os.path.join(arcdir, article.filename) + self.message(_('Updating HTML for article %(seq)s')) + try: + f = open(filename) + article.loadbody_fromHTML(f) + f.close() + except IOError, e: + if e.errno <> errno.ENOENT: raise + self.message(_('article file %(filename)s is missing!')) + article.prev = prev + article.next = next + omask = os.umask(002) + try: + f = open(filename, 'w') + finally: + os.umask(omask) + f.write(article.as_html()) + f.close() |